knitr::opts_chunk$set(echo = TRUE)
#empty environment before starting the script
rm(list=ls())
# Load necessary libraries
library(googlesheets4)
library(googledrive)
##
## Attaching package: 'googledrive'
## The following objects are masked from 'package:googlesheets4':
##
## request_generate, request_make
library(RColorBrewer) # plotting (color palettes)
library(paletteer) # plotting (color palettes)
library(gridExtra) # plotting (arrange multiple plots together)
library(patchwork) # plotting (combine multiple plots)
library(corrplot) # plotting (correlation plots)
## corrplot 0.95 loaded
library(ggpubr) # plotting (publication-ready plots)
## Loading required package: ggplot2
library(lubridate) # date and time data
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(hms) # time data
##
## Attaching package: 'hms'
## The following object is masked from 'package:lubridate':
##
## hms
library(Hmisc) # used for correlation analysis
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:patchwork':
##
## area
library(lme4) # GLMM (esp Poisson)
## Loading required package: Matrix
library(glmmTMB) # GLMM (esp negbin)
library(DHARMa) # checking GLMM assumptions
## This is DHARMa 0.4.7. For overview type '?DHARMa'. For recent changes, type news(package = 'DHARMa')
library(easystats) # GLMM assumptions and model performance
## # Attaching packages: easystats 0.7.4 (red = needs update)
## ✔ bayestestR 0.15.2 ✔ correlation 0.8.7
## ✖ datawizard 1.0.1 ✔ effectsize 1.0.0
## ✔ insight 1.1.0 ✔ modelbased 0.10.0
## ✔ performance 0.13.0 ✔ parameters 0.24.2
## ✔ report 0.6.1 ✔ see 0.11.0
##
## Restart the R-Session and update packages with `easystats::easystats_update()`.
library(broom.mixed) # tidying mixed models results
library(ggeffects) # plot model predictions
##
## Attaching package: 'ggeffects'
## The following object is masked from 'package:modelbased':
##
## pool_predictions
## The following object is masked from 'package:easystats':
##
## install_latest
library(emmeans) # EMMs (used for post hoc tests)
## Welcome to emmeans.
## Caution: You lose important information if you filter this package's results.
## See '? untidy'
library(multcompView) # plot results of post hoc text
library(sjPlot) # effect sizes plot
## Install package "strengejacke" from GitHub (`devtools::install_github("strengejacke/strengejacke")`) to load all sj-packages at once!
library(car) # anova type 2 & 3 tables
## Loading required package: carData
library(tidyverse) # data wrangling
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ stringr 1.5.1
## ✔ forcats 1.0.0 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ✔ readr 2.1.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::combine() masks gridExtra::combine()
## ✖ tidyr::expand() masks Matrix::expand()
## ✖ dplyr::filter() masks stats::filter()
## ✖ hms::hms() masks lubridate::hms()
## ✖ dplyr::lag() masks stats::lag()
## ✖ tidyr::pack() masks Matrix::pack()
## ✖ dplyr::recode() masks car::recode()
## ✖ googledrive::request_generate() masks googlesheets4::request_generate()
## ✖ googledrive::request_make() masks googlesheets4::request_make()
## ✖ dplyr::select() masks MASS::select()
## ✖ purrr::some() masks car::some()
## ✖ dplyr::src() masks Hmisc::src()
## ✖ dplyr::summarize() masks Hmisc::summarize()
## ✖ tidyr::unpack() masks Matrix::unpack()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
In this document I go through each of my research questions and perform all statistical analyses necessary to answer them.
Before going into each research question, I load and prepare all the necessary data.
The three main data tables (environmental data and information on experiment start time/location/etc., species identification data, experiment data with amount of ants observed at different time points) are loaded and the variables are reformatted as needed.
# loading the google sheet containing the experiment data
exp_data <- drive_get("experiment_data_all_years") %>%
read_sheet(, sheet = 2)
## ! Using an auto-discovered, cached token.
## To suppress this message, modify your code or options to clearly consent to
## the use of a cached token.
## See gargle's "Non-interactive auth" vignette for more details:
## <https://gargle.r-lib.org/articles/non-interactive-auth.html>
## ℹ The googledrive package is using a cached token for
## 'nelma.peclard@gmail.com'.
## Auto-refreshing stale OAuth token.
## ✔ The input `path` resolved to exactly 1 file.
## ! Using an auto-discovered, cached token.
## To suppress this message, modify your code or options to clearly consent to
## the use of a cached token.
## See gargle's "Non-interactive auth" vignette for more details:
## <https://gargle.r-lib.org/articles/non-interactive-auth.html>
## ℹ The googlesheets4 package is using a cached token for
## 'nelma.peclard@gmail.com'.
## Auto-refreshing stale OAuth token.
## ✔ Reading from "experiment_data_all_years".
## ✔ Range ''all_years''.
exp_data$count_time <- as.character(exp_data$count_time) # turning the count_time column into strings
# loading the google sheet containing the environmental data
env_data <- drive_get("full_env_data_all_years") %>%
read_sheet(, sheet = 2)
## ✔ The input `path` resolved to exactly 1 file.
## ✔ Reading from "full_env_data_all_years".
## ✔ Range ''all_years''.
env_data$date_time <- dmy_hm(paste0(env_data$date, sep = " ", env_data$start_time))
## Warning: 4 failed to parse.
env_data$date <- dmy(env_data$date) # turning the date column into a date object
env_data$start_time[!is.na(env_data$start_time)] <- paste0(env_data$start_time[!is.na(env_data$start_time)], ":00")
env_data$start_time <- as_hms(env_data$start_time) # turning the start_time column into a hh:mm:ss time object
env_data$end_time[!is.na(env_data$end_time)] <- paste0(env_data$end_time[!is.na(env_data$end_time)], ":00")
env_data$end_time <- as_hms(env_data$end_time) # turning the end_time column into a hh:mm:ss time object
# loading the google sheet containing the identified species data
species_data <- drive_get("species_data_all_years") %>%
read_sheet(, sheet = 2)
## ✔ The input `path` resolved to exactly 1 file.
## ✔ Reading from "species_data_all_years".
## ✔ Range ''all_years''.
For most of the analyses, I need the total number of ants found on all baits of one experiment at the end of the experiment.
# first calculation of the total number of ants counted after 120 min, after freezing, and in the lab before identification
tmp_exp_totals <- exp_data %>%
filter(count_time %in% c("120", "Frozen", "Lab")) %>% # Keep only relevant time points
group_by(sample_ID, count_time) %>% # Group by experiment and time point
summarise(
total_ants = if_else(all(is.na(individuals)), NA, sum(individuals, na.rm = TRUE)),
.groups = "drop") %>%
pivot_wider(names_from = count_time, values_from = total_ants, names_prefix = "total_") # Convert to wide format
# comparing the number of experiments where all values were NA between 120 min, after freezing, in the lab
length(tmp_exp_totals$total_120[is.na(tmp_exp_totals$total_120)])
## [1] 11
length(tmp_exp_totals$total_Frozen[is.na(tmp_exp_totals$total_Frozen)])
## [1] 56
length(tmp_exp_totals$total_Lab[is.na(tmp_exp_totals$total_Lab)])
## [1] 42
There are a lot more rows with missing values in the total ants counted after freezing or in the lab than at 120 minutes. This is very likely in part due to participants not reporting 0s after freezing on experiments where no ant was collected, and not sending back the empty ethanol vials.
I filtered the data table on Google Sheets and found that the number of experiments which have 0 ants at 120 minutes for all sugar concentrations but all missing values for lab counting is 13. Since participants wrote that there were 0 ants on all concentrations of the experiment at 120 minutes (the end of the experiment), I can consider that there were no ants to be sent back and counted in the lab. In those specific cases, I replace the NAs recorded in the lab by 0s. I do not modify experiments where one or more of the concentration had anything other than 0 (whether NA or positive value) ants at 120 minutes.
# extract experiments where all concentrations had 0 ants at 120 and lab has all NAs
# for each experiment, check that at count_time "120" the individuals count is 0 for every concentration.
tmp_exp_zero_120 <- exp_data %>%
filter(count_time == "120") %>% # select only rows at 120 minutes
group_by(sample_ID) %>%
summarise(
n_obs = n(), # Count number of rows (should be 5 i.e. all 5 baits had no ants)
all_zero = all(individuals == 0) # Will be NA if any value is NA
) %>%
filter(n_obs == 5, all_zero == TRUE) %>% # Keep only experiments with 5 observations all equal to 0
pull(sample_ID)
# for each experiment, filter those with NA at each concentration at count_time "lab"
tmp_exp_na_lab <- exp_data %>%
filter(count_time == "Lab") %>%
group_by(sample_ID) %>%
summarise(all_na = all(is.na(individuals))) %>% # find all NA totals counted in the lab
filter(all_na) %>% # only keep NA totals
pull(sample_ID)
# find experiments where both all 120 counts = 0 and all lab counts = NA
tmp_experiments_0_na <- intersect(tmp_exp_zero_120, tmp_exp_na_lab)
# create new data frame with modified values for lab counts
corr_exp_data <- exp_data %>%
mutate(individuals = if_else(
count_time == "Lab" & sample_ID %in% tmp_experiments_0_na & is.na(individuals),
0, # replace NA with 0 for lab samples from exp in the list above with all NA values
individuals)) # otherwise, keep the original value
I can now recalculate the total number of ants per experiment.
exp_totals <- corr_exp_data %>%
filter(count_time %in% c("120", "Frozen", "Lab")) %>% # Keep only relevant time points
group_by(sample_ID, count_time) %>% # Group by experiment and time point
summarise(
total_ants = if_else(all(is.na(individuals)), NA, sum(individuals, na.rm = TRUE)), # if all values are NA, keep NA, if at least 1 value is real, sum
.groups = "drop") %>%
pivot_wider(names_from = count_time, values_from = total_ants, names_prefix = "total_") # Convert to wide format
head(exp_totals)
## # A tibble: 6 × 4
## sample_ID total_120 total_Frozen total_Lab
## <chr> <dbl> <dbl> <dbl>
## 1 22-10000 0 0 0
## 2 22-11002 31 40 40
## 3 22-11003 12 NA NA
## 4 22-11004 3 NA 2
## 5 22-11005 0 NA 0
## 6 22-11006 6 NA 0
I can also obtain the total of ants from each experiment from the species data, which should correspond to the lab count totals. The species data only has rows for experiments which had ants, so the experiments where 0 ants were found need to be added.
The totals per concentration for each experiment can also be extracted from the species data and the lab count data (to add the values for the experiments with 0 ants).
# remove non-ants from the species data
corr_species_data <- species_data[species_data$family != "non-ant",]
## totals per experiment
# extract the total number of ants of all species per experiment
tmp_sp_totals <- corr_species_data %>%
group_by(sample_ID) %>% # Group by experiment
summarise(total_ants_sp = sum(individuals),
.groups = "drop")
nrow(tmp_sp_totals); nrow(env_data) # sp_totals has less rows because many exp had no ants
## [1] 142
## [1] 239
# intermediary check that the total of ants is the same as in the initial data frame
sum(tmp_sp_totals$total_ants_sp, na.rm = T) == sum(corr_species_data$individuals, na.rm = T)
## [1] TRUE
# Create a complete data frame with all sample_IDs
sp_totals <- expand.grid(sample_ID = unique(env_data$sample_ID)) # take sample ID column from env_data to have the full 239 experiments
# Merge complete data frame with ant totals
sp_totals <- left_join(sp_totals, tmp_sp_totals, by = c("sample_ID"))
# from exp_totals, filter experiments where the lab total count was 0 or NA
tmp_exp_totals_0_na <- exp_totals %>%
filter(total_Lab == 0 | is.na(total_Lab)) %>%
dplyr::select(sample_ID, total_Lab)
# update ant counts in species totals with exp where lab count is 0 or NA
sp_totals <- sp_totals %>%
left_join(tmp_exp_totals_0_na, by = c("sample_ID")) %>%
mutate(total_ants_sp = ifelse(!is.na(total_Lab) & total_Lab == 0 & is.na(total_ants_sp), 0, total_ants_sp)) %>%
dplyr::select(sample_ID, total_ants_sp)
# check sum of individuals is the same as in corr_species_data & we have the same sample IDs as in env_data
sum(sp_totals$total_ants_sp, na.rm = T) == sum(corr_species_data$individuals, na.rm = T)
## [1] TRUE
all(unique(sp_totals$sample_ID) == env_data$sample_ID)
## [1] TRUE
# compare number of rows with total = NA or total = 0
nrow(sp_totals[is.na(sp_totals$total_ants_sp),]); nrow(exp_totals[is.na(exp_totals$total_Lab),])
## [1] 30
## [1] 29
nrow(na.omit(sp_totals[sp_totals$total_ants_sp == 0,])) == nrow(exp_totals[exp_totals$total_Lab == 0 & !is.na(exp_totals$total_Lab),])
## [1] TRUE
# 1 more row with NA in sp_totals than exp totals: check if samples not in the species dataset have ants
na.omit(exp_totals[exp_totals$sample_ID %in% setdiff(env_data$sample_ID, tmp_sp_totals$sample_ID) & exp_totals$total_Lab > 0, c(1,4)])
## # A tibble: 1 × 2
## sample_ID total_Lab
## <chr> <dbl>
## 1 22-11009 2
# exp 22-11009 has ants counted in the lab which were not identified and will show as NA in sp_totals
## totals per bait
# extract the total number of ants of all species per experiment and bait
tmp_sp_totals_bait <- corr_species_data %>%
group_by(sample_ID, concentration) %>% # Group by experiment ant concentration
summarise(total_ants_sp_bait = sum(individuals),
.groups = "drop")
# intermediary check that the total of ants is the same as in the initial data frame
sum(tmp_sp_totals_bait$total_ants_sp_bait, na.rm = T) == sum(corr_species_data$individuals, na.rm = T)
## [1] TRUE
# Create a complete data frame with all combinations of sample ID and concentration
sp_totals_bait <- expand.grid(
sample_ID = unique(env_data$sample_ID), # take sample ID column from env_data to have the full 239 experiments
concentration = unique(corr_exp_data$concentration)) # all concentrations from 0 to 40
# Merge complete data frame with ant bait totals
sp_totals_bait <- left_join(sp_totals_bait, tmp_sp_totals_bait, by = c("sample_ID", "concentration")) %>%
arrange(sample_ID, concentration)
# from corr_exp_data, filter experiments where the lab total count was 0 or NA
tmp_exp_0_na <- corr_exp_data %>%
filter(count_time == "Lab" & (individuals == 0 | is.na(individuals))) %>%
dplyr::select(sample_ID, concentration, individuals) %>%
rename(lab_count = individuals)
# update ant counts in bait totals with exp where lab count is 0 or NA
sp_totals_bait <- sp_totals_bait %>%
left_join(tmp_exp_0_na, by = c("sample_ID", "concentration")) %>%
mutate(total_ants_sp_bait = ifelse(!is.na(lab_count) & lab_count == 0 & is.na(total_ants_sp_bait), 0, total_ants_sp_bait)) %>%
dplyr::select(sample_ID, concentration, total_ants_sp_bait) %>%
arrange(sample_ID, concentration)
# check sum of individuals is the same as in corr_species_data & we have the same sample IDs as in env_data
sum(sp_totals_bait$total_ants_sp_bait, na.rm = T) == sum(corr_species_data$individuals, na.rm = T)
## [1] TRUE
all(unique(sp_totals_bait$sample_ID) == env_data$sample_ID)
## [1] TRUE
# compare number of rows with total = NA or total = 0
nrow(sp_totals_bait[is.na(sp_totals_bait$total_ants_sp_bait),]); nrow(tmp_exp_0_na[is.na(tmp_exp_0_na$lab_count),])
## [1] 170
## [1] 169
nrow(na.omit(sp_totals_bait[sp_totals_bait$total_ants_sp_bait == 0,])) == nrow(tmp_exp_0_na[tmp_exp_0_na$lab_count == 0 & !is.na(tmp_exp_0_na$lab_count),])
## [1] TRUE
# 1 more row with NA in sp_totals_bait than exp totals: check if samples not in the species dataset have ants
na.omit(corr_exp_data[corr_exp_data$sample_ID %in% setdiff(env_data$sample_ID, tmp_sp_totals_bait$sample_ID) & corr_exp_data$count_time == "Lab" & corr_exp_data$individuals > 0, c(1:2,4)])
## # A tibble: 1 × 3
## sample_ID concentration individuals
## <chr> <dbl> <dbl>
## 1 22-11009 10 2
# also exp 22-11009 has ants counted in the lab which were not identified and will show as NA in sp_totals
Experiment 22-11009 had 2 ants counted in the lab but no ants in the species data, meaning the species was not identified because the sample was not found. As it stands, this row is reported as NA in the species totals.
I compare the species data with the experiment data to see whether the total number of ants for each experiment matches.
## comparing ant totals per experiment
# add the total_lab column calculated previously
tmp_totals_comparison <- cbind(sp_totals, exp_totals$total_Lab)
# extract experiments where the total from the lab counting and the total from the species data does not match
exp_problem_totals <- tmp_totals_comparison[tmp_totals_comparison[,2] != tmp_totals_comparison[,3],]
exp_problem_totals <- exp_problem_totals[!is.na(exp_problem_totals$sample_ID),] # remove NA rows
# compare total ants from both sources
sum(tmp_totals_comparison[,3], na.rm = T); sum(tmp_totals_comparison$total_ants_sp, na.rm = T)
## [1] 5468
## [1] 5457
# percentage of difference between the 2 totals
1-(sum(tmp_totals_comparison$total_ants_sp, na.rm = T)/sum(tmp_totals_comparison[,3], na.rm = T))
## [1] 0.002011704
## comparing totals at each bait
# from corr_exp_data, filter lab totals at each concentration
tmp_exp_lab_counts <- corr_exp_data %>%
filter(count_time == "Lab") %>%
dplyr::select(sample_ID, concentration, individuals) %>% # keep only relevant columns
rename(lab_count = individuals) # rename the individuals column
tmp_totals_bait_comparison <- cbind(sp_totals_bait, tmp_exp_lab_counts$lab_count)
# compare total ants from both sources
sum(tmp_totals_bait_comparison$total_ants_sp_bait, na.rm = T); sum(tmp_totals_bait_comparison[,4], na.rm = T)
## [1] 5457
## [1] 5468
# extract experiments where the total from the lab counting and the total from the species data does not match
baits_problem_totals <- tmp_totals_bait_comparison[tmp_totals_bait_comparison[,4] != tmp_totals_bait_comparison[,3],]
baits_problem_totals <- baits_problem_totals[!is.na(baits_problem_totals$sample_ID),] # remove NA rows
There are some minor discrepancies (less than 1% of the total number of ants) between the lab total and the species data total. Since a higher proportion of the species data was counted using a click counter, I consider the species data counts to be more reliable and use these as the variable for analysis over the lab totals when working with the number of ants collected at the end of the experiment.
In order to keep my global environment as clean as possible and without too many variables with similar names at once, I periodically remove all variables labeled as temporary (name starts with prefix “tmp_”) from the global environment.
rm(list = ls(pattern = "^tmp_"))
The number of ants collected at the end of each experiment does not account for absolute differences in abundance due to ant colony size or foraging strategy (e.g., solitary foraging vs mass recruiting). Both are important features of ant foraging and vary between species (among other factors).
In order to account for this as best as possible, I create scaled measures of ant activity which account for these absolute differences in abundances and which are appropriate for each research question.
# sum all ants of the same species of each experiment together
tmp_species_per_exp <- corr_species_data %>%
group_by(sample_ID, sp_abbr) %>% # for now we group by abbreviated species name
dplyr::summarize(individuals = sum(individuals, na.rm = T), .groups = "drop")
# List of all unique species
tmp_all_species <- unique(corr_species_data$sp_abbr)
# Create an empty data frame with 1 species per exp
species_per_exp <- expand.grid(
sample_ID = unique(env_data$sample_ID), # take sample ID column from env_data to have all 239 experiments
sp_abbr = tmp_all_species) %>%
arrange(sample_ID)
# Merge this complete data frame with species counts
species_per_exp <- left_join(species_per_exp, tmp_species_per_exp,
by = c("sample_ID", "sp_abbr"))
# update species table with rows where no ants were found
species_per_exp <- species_per_exp %>%
left_join(sp_totals, by = c("sample_ID")) %>% # join species totals calculated in chunk 0.2.1.4
mutate(individuals = case_when(
is.na(individuals) & total_ants_sp == 0 ~ 0, # True zero: No ants on this bait
is.na(individuals) & !is.na(total_ants_sp) ~ 0, # Species not found → Absence is zero
is.na(individuals) & is.na(total_ants_sp) ~ NA, # Preserve true missing values
.default = individuals)) %>% # Keep original values )) %>%
dplyr::select(-total_ants_sp) # Drop extra column
# check sum of individuals is the same as in corr_species_data & all sample IDs from env_data are there
sum(species_per_exp$individuals, na.rm = T) == sum(corr_species_data$individuals, na.rm = T)
## [1] TRUE
all(unique(species_per_exp$sample_ID) == env_data$sample_ID)
## [1] TRUE
# compare number of rows where total = NA with the sp_totals (nrow with NAs/number of species)
nrow(species_per_exp[is.na(species_per_exp$individuals),])/length(tmp_all_species) == nrow(sp_totals[is.na(sp_totals$total_ants_sp),])
## [1] TRUE
all(unique(species_per_exp$sample_ID[is.na(species_per_exp$individuals)]) == sp_totals$sample_ID[is.na(sp_totals$total_ants_sp)])
## [1] TRUE
# extract maximum number of ants of each species found on any experiment
scaled_species_exp <- species_per_exp %>%
group_by(sp_abbr) %>%
mutate(species_max = max(individuals, na.rm = T)) %>% # add new column with max individuals of that genus across baits
ungroup() # remove grouping by species
# add column with scaled ant total per species
scaled_species_exp <- scaled_species_exp %>%
mutate(scaled_species_total = (individuals/species_max))
# sum all ants of the same species on each bait of the same experiment together
tmp_species_per_bait <- corr_species_data %>%
group_by(sample_ID, concentration, sp_abbr) %>% # for now we group by abbreviated species name
dplyr::summarize(individuals = sum(individuals, na.rm = T), .groups = "drop")
# List of all unique species
tmp_all_species <- unique(corr_species_data$sp_abbr)
# Create an empty data frame with 1 species per concentration per exp.
species_per_bait <- expand.grid(
sample_ID = unique(env_data$sample_ID), # take sample ID column from env_data to have all 239 experiments
concentration = unique(corr_exp_data$concentration), # take concentration column from exp data to have all 5 []
sp_abbr = tmp_all_species)
# Merge this complete data frame with species counts
species_per_bait <- left_join(species_per_bait, tmp_species_per_bait,
by = c("sample_ID", "concentration", "sp_abbr")) %>%
arrange(sample_ID, concentration)
# update species table with rows where no ants were found
species_per_bait <- species_per_bait %>%
left_join(sp_totals_bait, by = c("sample_ID", "concentration")) %>% # join species totals per bait calculated in chunk 0.2.4
mutate(individuals = case_when(
is.na(individuals) & total_ants_sp_bait == 0 ~ 0, # True zero: No ants on this bait
is.na(individuals) & !is.na(total_ants_sp_bait) ~ 0, # Species not found → Absence is zero
is.na(individuals) & is.na(total_ants_sp_bait) ~ NA, # Preserve true missing values
.default = individuals)) %>% # Keep original values )) %>%
dplyr::select(-total_ants_sp_bait) # Drop extra column
# check sum of individuals is the same as in corr_species_data & all sample IDs from env_data are there
sum(species_per_bait$individuals, na.rm = T) == sum(corr_species_data$individuals, na.rm = T)
## [1] TRUE
all(unique(species_per_bait$sample_ID) == env_data$sample_ID)
## [1] TRUE
# compare number of rows where total = NA with the sp_totals (nrow with NAs/number of species)
nrow(species_per_bait[is.na(species_per_bait$individuals),])/length(tmp_all_species) == nrow(sp_totals_bait[is.na(sp_totals_bait$total_ants_sp_bait),])
## [1] TRUE
all(unique(species_per_bait$sample_ID[is.na(species_per_bait$individuals)]) == unique(sp_totals_bait$sample_ID[is.na(sp_totals_bait$total_ants_sp_bait)]))
## [1] TRUE
# calculate species total per bait
tmp_sp_bait_total <- species_per_bait %>%
group_by(sample_ID, concentration) %>%
dplyr::summarize(individuals = sum(individuals), .groups = "drop")
# control that species_per_bait has the same totals as species totals per bait
which(tmp_sp_bait_total$individuals != sp_totals_bait$total_ants_sp_bait)
## integer(0)
# extract total number of ants of each species found on any experiment
scaled_species_bait <- species_per_bait %>%
group_by(sample_ID, sp_abbr) %>%
mutate(species_total_exp = ifelse(all(is.na(individuals)), NA, sum(individuals, na.rm = TRUE)), # add new column with total individuals of that species per experiment, keep all values as NA if the entire experiment has missing values
scaled_species_total = case_when(
!is.na(individuals) & species_total_exp == 0 ~ 0, # avoid divisions by 0
is.na(individuals) ~ NA, # keep NA values
.default = individuals / species_total_exp)) %>% # scaled value
ungroup() # remove grouping by species
scaled_totals_bait <- sp_totals_bait %>%
group_by(sample_ID) %>%
mutate(ant_total_exp = ifelse(all(is.na(total_ants_sp_bait)), NA, sum(total_ants_sp_bait, na.rm = TRUE)), # add new column with total individuals per experiment, keep all values as NA if the entire experiment has missing values
scaled_ant_total = case_when(
!is.na(total_ants_sp_bait) & ant_total_exp == 0 ~ 0, # avoid divisions by 0
is.na(total_ants_sp_bait) ~ NA, # keep NA values
.default = total_ants_sp_bait / ant_total_exp)) %>% # scaled value
ungroup() # remove grouping by species
# select relevant columns and count times
scaled_timecount <- corr_exp_data %>%
select(sample_ID, concentration, count_time, individuals) %>% # keep only relevant columns
filter(count_time %in% c(5, 10, 20, 40, 80, 120)) # remove frozen and lab counts
# calculate total ants per time point + scaled column
scaled_timecount <- scaled_timecount %>%
group_by(sample_ID, count_time) %>%
mutate(total_ants_time = sum(individuals, na.rm = T)) %>% # count total of ants at each time point
ungroup() %>%
mutate(scaled_ants_time = case_when( # add scaled column
!is.na(individuals) & total_ants_time == 0 ~ 0, # avoid divisions by 0
is.na(individuals) ~ NA, # keep NA values
.default = individuals / total_ants_time)) # scaled value
rm(list = ls(pattern = "^tmp_"))
# create a data frame with only relevant columns for analysis
cut_env_data <- cbind(env_data[,1:2], env_data[,50], env_data[,3:7],
env_data[,9], env_data[,12], env_data[,14:47])
Both time and date data need to be converted into numerical values in order to run the statistical analysis.
For the time data, I filter the env_data Google Sheet and see that the earliest experiment start time is 08:10, and the latest is 18:36. I can turn the time of day data into numerical data by converting the time data into minutes since 8 AM.
# adding a new column to the table with minutes since 8 am
cut_env_data <- cut_env_data %>%
mutate(minutes_since_8am = (hour(start_time) * 60 + minute(start_time)) - (8 * 60)) %>%
relocate(minutes_since_8am, .after = end_time) # re-order columns
# View transformed time column
head(cut_env_data[, c("start_time", "minutes_since_8am")])
## start_time minutes_since_8am
## 1 09:30:00 90
## 2 09:56:00 116
## 3 10:58:00 178
## 4 09:50:00 110
## 5 10:20:00 140
## 6 10:35:00 155
For the date data, I can convert it into numerical values by finding the earliest experiment day and considering it as day 1 and assigning numbers to all subsequent days relative to the first experiment day. I am using this method instead of considering January 1st as day 1 because 2024, unlike 2023 and 2022, was a leap year so it has 1 more day in February.
# Create a new column that maps each date to a fixed year (e.g., 1999)
# This ensures that only the month and day are considered.
cut_env_data <- cut_env_data %>%
mutate(date_day_month = as.Date(paste0("1999-", format(date, "%m-%d")))) %>%
relocate(date_day_month, .before = start_time) # re-order columns
# Find the earliest day (month and day) across all experiments in the fixed-year space
tmp_first_experiment_fixed <- min(cut_env_data$date_day_month, na.rm = T)
# Create a new column with the relative day (with the earliest experiment as day 1)
# Create a year column
cut_env_data <- cut_env_data %>%
mutate(relative_day = as.numeric(difftime(date_day_month, tmp_first_experiment_fixed, units = "days")) + 1,
year = year(date)) %>%
relocate(relative_day, .before = start_time) %>%
relocate(year, .after = date_time) # re-order columns
# Check the results
head(cut_env_data[, c("date", "date_day_month", "relative_day")])
## date date_day_month relative_day
## 1 2022-05-13 1999-05-13 32
## 2 2022-05-20 1999-05-20 39
## 3 2022-05-23 1999-05-23 42
## 4 2022-05-20 1999-05-20 39
## 5 2022-05-25 1999-05-25 44
## 6 2022-05-20 1999-05-20 39
rm(list = ls(pattern = "^tmp_"))
In the Ant Picnic data sheets, the information regarding the presence or absence of rain during the experiment and the level of cloud cover were merged into a “weather” category, combining the 4 possibilities of “no clouds”, “lightly cloudy”, “very cloudy” and “rain”. For my analysis, I need to dissociate the presence/absence of rain (a binary variable) from the cloud cover level.
All experiments with rain were also very cloudy. Therefore, I can add a “rain” column and replace the “rain, very cloudy” entries with very cloudy in the newly renamed “clouds” column.
# view the levels of the weather column
levels(as.factor(cut_env_data$weather)) # all rainy exp were also very cloudy
## [1] "lightly cloudy" "no clouds" "rain, very cloudy"
## [4] "very cloudy"
# create new rain column and rename weather column
cut_env_data <- cut_env_data %>%
mutate(rain = if_else(weather == "rain, very cloudy", "rain", "no rain"),
weather = if_else(weather == "rain, very cloudy", "very cloudy", weather)) %>%
rename(clouds = weather) %>% # renaming the weather column
relocate(rain, .after = wind) # re-order columns
# check known experiments with rain
print(cut_env_data[cut_env_data$sample_ID %in% c("23-22001", "23-22003","23-22007", "24-11083", "24-11084", "24-11085", "24-11086"), c("clouds", "rain")])
## clouds rain
## 88 very cloudy rain
## 90 very cloudy rain
## 92 very cloudy rain
## 212 very cloudy rain
## 213 very cloudy rain
## 214 very cloudy rain
## 215 very cloudy rain
# check that number of experiments with rain = 7
nrow(na.omit(cut_env_data[cut_env_data$rain == "rain", c("sample_ID", "clouds", "rain")]))
## [1] 7
I also need to encode my categorical variables as numbers for later analysis. For rain, it is coded as 0 = no rain and 1 = rain. For clouds, wind, and shade, it is coded as 1 = none, 2 = intermediate (light wind/clouds, half-shade), 3 = very (or full for shade).
# check the levels for all categorical variables
levels(as.factor(cut_env_data$clouds))
## [1] "lightly cloudy" "no clouds" "very cloudy"
levels(as.factor(cut_env_data$wind))
## [1] "no wind" "strong wind" "weak wind"
levels(as.factor(cut_env_data$shade))
## [1] "full shade" "half-shade" "no shade"
levels(as.factor(cut_env_data$rain))
## [1] "no rain" "rain"
# recode the categorical columns
cut_env_data <- cut_env_data %>%
mutate(clouds_numeric = dplyr::recode(clouds,
"no clouds" = 1,
"lightly cloudy" = 2,
"very cloudy" = 3),
wind_numeric = dplyr::recode(wind,
"no wind" = 1,
"weak wind" = 2,
"strong wind" = 3),
shade_numeric = dplyr::recode(shade,
"no shade" = 1,
"half-shade" = 2,
"full shade" = 3),
rain_numeric = dplyr::recode(rain,
"no rain" = 0,
"rain" = 1)) %>%
relocate(c(clouds_numeric, wind_numeric, shade_numeric, rain_numeric), .after = rain) # re-order columns
To answer my research questions 1 (see below), I need a table which combines the scaled number of ants per species per experiment and the environmental data. I also ensure the variable formats are correct.
# add env data to scaled species per exp
env_analysis <- scaled_species_exp %>%
left_join(cut_env_data, by = c("sample_ID"))
# making sure all columns have the right formats
env_analysis <- env_analysis %>%
mutate(across(c("sample_ID", "sp_abbr", "climate_logger_ID", "city", "site"), as.factor),
clouds = factor(clouds, levels = c("no clouds", "lightly cloudy", "very cloudy")), # specify order of factor levels
shade = factor(shade, levels = c("no shade", "half-shade", "full shade")),
wind = factor(wind, levels = c("no wind", "weak wind", "strong wind")),
rain = factor(rain, levels = c("no rain", "rain")),
across(c("individuals", "species_max", "scaled_species_total", "year",
"relative_day", "minutes_since_8am", "latitude", "longitude",
"temperature"), as.numeric),
across(c(24:57), as.numeric)) # set all imp and climate logger variables to numeric
str(env_analysis)
## tibble [6,453 × 57] (S3: tbl_df/tbl/data.frame)
## $ sample_ID : Factor w/ 239 levels "22-10000","22-11002",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ sp_abbr : Factor w/ 27 levels "D. quadripunctatus",..: 12 13 21 6 22 27 3 15 26 16 ...
## $ individuals : num [1:6453] 0 0 0 0 0 0 0 0 0 0 ...
## $ species_max : num [1:6453] 306 2 6 11 47 502 2 15 27 18 ...
## $ scaled_species_total: num [1:6453] 0 0 0 0 0 0 0 0 0 0 ...
## $ date : Date[1:6453], format: "2022-05-13" "2022-05-13" ...
## $ date_time : POSIXct[1:6453], format: "2022-05-13 09:30:00" "2022-05-13 09:30:00" ...
## $ year : num [1:6453] 2022 2022 2022 2022 2022 ...
## $ date_day_month : Date[1:6453], format: "1999-05-13" "1999-05-13" ...
## $ relative_day : num [1:6453] 32 32 32 32 32 32 32 32 32 32 ...
## $ start_time : 'hms' num [1:6453] 09:30:00 09:30:00 09:30:00 09:30:00 ...
## ..- attr(*, "units")= chr "secs"
## $ end_time : 'hms' num [1:6453] 11:26:00 11:26:00 11:26:00 11:26:00 ...
## ..- attr(*, "units")= chr "secs"
## $ minutes_since_8am : num [1:6453] 90 90 90 90 90 90 90 90 90 90 ...
## $ climate_logger_ID : Factor w/ 101 levels "94226711","94226712",..: 75 75 75 75 75 75 75 75 75 75 ...
## $ latitude : num [1:6453] 51.5 51.5 51.5 51.5 51.5 ...
## $ longitude : num [1:6453] 12 12 12 12 12 ...
## $ city : Factor w/ 3 levels "Berlin","Halle",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ site : Factor w/ 39 levels "Anna-Essinger-Gemeinschaftsschule",..: 10 10 10 10 10 10 10 10 10 10 ...
## $ temperature : num [1:6453] 19 19 19 19 19 19 19 19 19 19 ...
## $ clouds : Factor w/ 3 levels "no clouds","lightly cloudy",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ shade : Factor w/ 3 levels "no shade","half-shade",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ wind : Factor w/ 3 levels "no wind","weak wind",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ rain : Factor w/ 2 levels "no rain","rain": 1 1 1 1 1 1 1 1 1 1 ...
## $ clouds_numeric : num [1:6453] 2 2 2 2 2 2 2 2 2 2 ...
## $ wind_numeric : num [1:6453] 3 3 3 3 3 3 3 3 3 3 ...
## $ shade_numeric : num [1:6453] 2 2 2 2 2 2 2 2 2 2 ...
## $ rain_numeric : num [1:6453] 0 0 0 0 0 0 0 0 0 0 ...
## $ imp_mean : num [1:6453] 37.4 37.4 37.4 37.4 37.4 ...
## $ imp_median : num [1:6453] 22 22 22 22 22 22 22 22 22 22 ...
## $ imp_stdev : num [1:6453] 40.5 40.5 40.5 40.5 40.5 ...
## $ imp_min : num [1:6453] 0 0 0 0 0 0 0 0 0 0 ...
## $ imp_max : num [1:6453] 100 100 100 100 100 100 100 100 100 100 ...
## $ t1_mean : num [1:6453] 18.4 18.4 18.4 18.4 18.4 ...
## $ t1_max : num [1:6453] 18.5 18.5 18.5 18.5 18.5 18.5 18.5 18.5 18.5 18.5 ...
## $ t1_min : num [1:6453] 18.2 18.2 18.2 18.2 18.2 ...
## $ t1_sd : num [1:6453] 0.107 0.107 0.107 0.107 0.107 ...
## $ t1_var : num [1:6453] 0.0113 0.0113 0.0113 0.0113 0.0113 ...
## $ t2_mean : num [1:6453] 18.9 18.9 18.9 18.9 18.9 ...
## $ t2_max : num [1:6453] 19.4 19.4 19.4 19.4 19.4 ...
## $ t2_min : num [1:6453] 18.4 18.4 18.4 18.4 18.4 ...
## $ t2_sd : num [1:6453] 0.387 0.387 0.387 0.387 0.387 ...
## $ t2_var : num [1:6453] 0.15 0.15 0.15 0.15 0.15 ...
## $ t3_mean : num [1:6453] 18.2 18.2 18.2 18.2 18.2 ...
## $ t3_max : num [1:6453] 18.8 18.8 18.8 18.8 18.8 ...
## $ t3_min : num [1:6453] 17.7 17.7 17.7 17.7 17.7 ...
## $ t3_sd : num [1:6453] 0.38 0.38 0.38 0.38 0.38 ...
## $ t3_var : num [1:6453] 0.145 0.145 0.145 0.145 0.145 ...
## $ raw_moist_mean : num [1:6453] 741 741 741 741 741 ...
## $ raw_moist_max : num [1:6453] 748 748 748 748 748 748 748 748 748 748 ...
## $ raw_moist_min : num [1:6453] 733 733 733 733 733 733 733 733 733 733 ...
## $ raw_moist_sd : num [1:6453] 5.06 5.06 5.06 5.06 5.06 ...
## $ raw_moist_var : num [1:6453] 25.6 25.6 25.6 25.6 25.6 ...
## $ vol_moist_mean : num [1:6453] 1.97 1.97 1.97 1.97 1.97 ...
## $ vol_moist_max : num [1:6453] 2.1 2.1 2.1 2.1 2.1 2.1 2.1 2.1 2.1 2.1 ...
## $ vol_moist_min : num [1:6453] 1.8 1.8 1.8 1.8 1.8 1.8 1.8 1.8 1.8 1.8 ...
## $ vol_moist_sd : num [1:6453] 0.111 0.111 0.111 0.111 0.111 ...
## $ vol_moist_var : num [1:6453] 0.0124 0.0124 0.0124 0.0124 0.0124 ...
To answer my research question 2 (see below), I need a table which combines the scaled number of ants per species per bait per experiment and the “site” variables which will be modeled as a random effect. I also add a “presence” column which records whether there was > 0 ants present on the baits and ensure the variable formats are correct.
# adding a site column for the random effect + a presence column
sugar_analysis <- scaled_species_bait %>%
left_join(env_data[,c("sample_ID", "site")], by = c("sample_ID")) %>% # add site column
mutate(presence = ifelse(individuals > 0, 1, 0)) %>% # add presence/absence column
relocate(presence, .before = site)
# making sure all columns have the right formats
sugar_analysis <- sugar_analysis %>%
mutate(across(c("sample_ID", "sp_abbr", "site"), as.factor),
across(c("individuals", "concentration", "species_total_exp", "scaled_species_total",
"presence"), as.numeric))
str(sugar_analysis)
## tibble [32,265 × 8] (S3: tbl_df/tbl/data.frame)
## $ sample_ID : Factor w/ 239 levels "22-10000","22-11002",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ concentration : num [1:32265] 0 0 0 0 0 0 0 0 0 0 ...
## $ sp_abbr : Factor w/ 27 levels "D. quadripunctatus",..: 12 13 21 6 22 27 3 15 26 16 ...
## $ individuals : num [1:32265] 0 0 0 0 0 0 0 0 0 0 ...
## $ species_total_exp : num [1:32265] 0 0 0 0 0 0 0 0 0 0 ...
## $ scaled_species_total: num [1:32265] 0 0 0 0 0 0 0 0 0 0 ...
## $ presence : num [1:32265] 0 0 0 0 0 0 0 0 0 0 ...
## $ site : Factor w/ 39 levels "Anna-Essinger-Gemeinschaftsschule",..: 10 10 10 10 10 10 10 10 10 10 ...
# data frame for overall activity (not species-level)
sugar_analysis_overall <- scaled_totals_bait %>%
left_join(env_data[,c("sample_ID", "site")], by = c("sample_ID")) %>%
rename(individuals = total_ants_sp_bait) %>%
mutate(presence = ifelse(individuals > 0, 1, 0)) %>% # add presence/absence column
relocate(presence, .before = site)
# making sure all columns have the right formats
sugar_analysis_overall <- sugar_analysis_overall %>%
mutate(across(c("sample_ID", "site"), as.factor),
across(c("individuals", "concentration", "ant_total_exp", "scaled_ant_total",
"presence"), as.numeric))
str(sugar_analysis_overall)
## tibble [1,195 × 7] (S3: tbl_df/tbl/data.frame)
## $ sample_ID : Factor w/ 239 levels "22-10000","22-11002",..: 1 1 1 1 1 2 2 2 2 2 ...
## $ concentration : num [1:1195] 0 5 10 20 40 0 5 10 20 40 ...
## $ individuals : num [1:1195] 0 0 0 0 0 0 1 2 0 37 ...
## $ ant_total_exp : num [1:1195] 0 0 0 0 0 40 40 40 40 40 ...
## $ scaled_ant_total: num [1:1195] 0 0 0 0 0 0 0.025 0.05 0 0.925 ...
## $ presence : num [1:1195] 0 0 0 0 0 0 1 1 0 1 ...
## $ site : Factor w/ 39 levels "Anna-Essinger-Gemeinschaftsschule",..: 10 10 10 10 10 37 37 37 37 37 ...
For research question 3, I need a table which combines the scaled number of ants per counting time point per experiment and the “site” variables which will be modeled as a random effect. I also add a “presence” column which records whether there was > 0 ants present on the baits and ensure the variable formats are correct.
# add site and presence column to scaled_timecount
speed_analysis <- scaled_timecount %>%
left_join(env_data[,c("sample_ID", "site")], by = c("sample_ID")) %>% # add site column
mutate(presence = ifelse(individuals > 0, 1, 0)) %>% # add presence/absence column
relocate(presence, .before = site)
# making sure all columns have the right formats
speed_analysis <- speed_analysis %>%
mutate(across(c("sample_ID", "concentration", "site"), as.factor),
across(c("count_time", "individuals", "total_ants_time", "scaled_ants_time",
"presence"), as.numeric))
str(speed_analysis)
## tibble [7,170 × 8] (S3: tbl_df/tbl/data.frame)
## $ sample_ID : Factor w/ 239 levels "22-10000","22-11002",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ concentration : Factor w/ 5 levels "0","5","10","20",..: 1 1 1 1 1 1 2 2 2 2 ...
## $ count_time : num [1:7170] 5 10 20 40 80 120 5 10 20 40 ...
## $ individuals : num [1:7170] 0 0 0 0 1 0 0 0 0 0 ...
## $ total_ants_time : num [1:7170] 0 0 0 1 2 0 0 0 0 1 ...
## $ scaled_ants_time: num [1:7170] 0 0 0 0 0.5 0 0 0 0 0 ...
## $ presence : num [1:7170] 0 0 0 0 1 0 0 0 0 0 ...
## $ site : Factor w/ 39 levels "Anna-Essinger-Gemeinschaftsschule",..: 10 10 10 10 10 10 10 10 10 10 ...
Since the Ant Picnic experiments were conducted both by citizens and a scientist, it is interesting to compare the accuracy of some aspects of the data between citizen participants and scientists. Moreover, it is interesting to check how the participant-recorded measures such as temperature compare with the climate logger.
I start by comparing the microclimate logger temperatures (both ground temperature and aboveground temperature) with the participant-recorded temperatures (from a weather app).
# ground temperature vs weather app temperature, per year
(tmp_t2_temp_plot <- ggplot(na.omit(env_analysis), aes(x = temperature, y = t2_mean, color = as.factor(year)))+
geom_point()+
geom_smooth(method = lm, se = F)+
geom_abline(linetype = 2, color = "red")+ # add 1:1 identity line in red and dashed
scale_color_paletteer_d("nationalparkcolors::Acadia") + # Apply Acadia discrete color scale
labs(x = "Weather app temperature (\u00B0C)",
y = "Ground temperature (microclimate logger) (\u00B0C)", color = "Year") +
theme_classic2() +
theme(legend.text = element_text(size = 8), # Smaller legend text
legend.title = element_text(size = 12, hjust = 0.5, vjust = 2), # Smaller legend title
legend.key.size = unit(0.6, "cm"), # Makes the legend squares smaller
axis.title.x = element_text(margin = margin(t = 10)), # adjust distance from axis title to axis
axis.title.y = element_text(margin = margin(r = 10), hjust = 0.5)) # adjust distance from axis title to axis, center title
)
## `geom_smooth()` using formula = 'y ~ x'
# above ground temperature vs weather app temperature, per year
(tmp_t3_temp_plot <- ggplot(na.omit(env_analysis), aes(x = temperature, y = t3_mean, color = as.factor(year)))+
geom_point()+
geom_smooth(method = lm, se = F)+
geom_abline(linetype = 2, color = "red")+ # add 1:1 identity line in red and dashed
scale_color_paletteer_d("nationalparkcolors::Acadia") + # Apply Acadia discrete color scale
labs(x = "Weather app temperature (\u00B0C)",
y = "Aboveground temperature (microclimate logger) (\u00B0C)", color = "Year") +
theme_classic2() +
theme(legend.text = element_text(size = 8), # Smaller legend text
legend.title = element_text(size = 12, hjust = 0.5, vjust = 2), # Smaller legend title
legend.key.size = unit(0.6, "cm"), # Makes the legend squares smaller
axis.title.x = element_text(margin = margin(t = 10)), # adjust distance from axis title to axis
axis.title.y = element_text(margin = margin(r = 10), hjust = 0.5)) # adjust distance from axis title to axis, center title
)
## `geom_smooth()` using formula = 'y ~ x'
# ground temp vs weather app temperature, all years pooled
(tmp_t2_temp_all_plot <- ggplot(na.omit(env_analysis), aes(x = temperature, y = t2_mean))+
geom_point(color = "black")+
geom_smooth(method = lm, se = F, , color = "black")+
geom_abline(linetype = 2, color = "red")+ # add 1:1 identity line in red and dashed
labs(x = "Weather app temperature (\u00B0C)",
y = "Ground temperature (microclimate logger) (\u00B0C)") +
theme_classic2() +
theme(axis.title.x = element_text(margin = margin(t = 10)), # adjust distance from axis title to axis
axis.title.y = element_text(margin = margin(r = 10), hjust = 0.5)) # adjust distance from axis title to axis, center title
)
## `geom_smooth()` using formula = 'y ~ x'
# aboveground temp vs weather app temperature, all years pooled
(tmp_t3_temp_all_plot <- ggplot(na.omit(env_analysis), aes(x = temperature, y = t3_mean))+
geom_point(color = "black")+
geom_smooth(method = lm, se = F, , color = "black")+
geom_abline(linetype = 2, color = "red")+ # add 1:1 identity line in red and dashed
labs(x = "Weather app temperature (\u00B0C)",
y = "Aboveground temperature (microclimate logger) (\u00B0C)") +
theme_classic2() +
theme(axis.title.x = element_text(margin = margin(t = 10)), # adjust distance from axis title to axis
axis.title.y = element_text(margin = margin(r = 10), hjust = 0.5)) # adjust distance from axis title to axis, center title
)
## `geom_smooth()` using formula = 'y ~ x'
It appears that all temperature measures correlate fairly well, with ground temperatures usually being a bit warmer than weather app temperatures, especially at lower temperatures.
I can also compare the counting accuracy between citizens and a scientist, both at the end of the experiment while collecting the ants (120 minutes) and after freezing with the number of ants counted in the lab during species identification.
# add a citizen vs scientist column
tmp_count_accuracy <- exp_totals %>%
dplyr::select(1:3) %>%
left_join(sp_totals, by = "sample_ID") %>% # add lab totals from species data
filter(total_ants_sp > 0) %>% # only keep rows with ants
mutate(experimenter = case_when( # create new experimenter column
grepl("^22-|^23-", sample_ID) ~ "citizens", # if sample ID starts with 22 or 23, assign citizen
grepl("^24-", sample_ID) ~ "scientist", # if sample ID starts with 24, assign scientist
.default = NA_character_ # Assigns NA if none of the conditions match
))
# plot count accuracy after 120 minutes, citizens vs scientist
(tmp_count_acc_120_plot <- ggplot(na.omit(tmp_count_accuracy), aes(x = total_120, y = total_ants_sp, color = as.factor(experimenter)))+
geom_point()+
geom_smooth(method = lm, se = F)+
geom_abline(linetype = 2, color = "red")+ # add 1:1 identity line in red and dashed
scale_color_paletteer_d("nationalparkcolors::Acadia") + # Apply Acadia discrete color scale
scale_x_continuous(limits = c(0, max(tmp_count_accuracy$total_ants_sp, na.rm = T))) +
scale_y_continuous(limits = c(0, max(tmp_count_accuracy$total_ants_sp, na.rm = T))) +
labs(x = "Number of ants counted after 120 minutes",
y = "Number of ants counted in the lab", color = "Experimenter") +
theme_classic2() +
theme(legend.text = element_text(size = 8), # Smaller legend text
legend.title = element_text(size = 12, hjust = 0.5, vjust = 2), # Smaller legend title
legend.key.size = unit(0.6, "cm"), # Makes the legend squares smaller
axis.title.x = element_text(margin = margin(t = 10)), # adjust distance from axis title to axis
axis.title.y = element_text(margin = margin(r = 10), hjust = 0.5)) # adjust distance from axis title to axis, center title
)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_smooth()`).
# plot count accuracy after freezing, citizens vs scientist
(tmp_count_acc_frozen_plot <- ggplot(na.omit(tmp_count_accuracy), aes(x = total_Frozen, y = total_ants_sp, color = as.factor(experimenter)))+
geom_point()+
geom_smooth(method = lm, se = F)+
geom_abline(linetype = 2, color = "red")+ # add 1:1 identity line in red and dashed
scale_color_paletteer_d("nationalparkcolors::Acadia") + # Apply Acadia discrete color scale
scale_x_continuous(limits = c(0, max(tmp_count_accuracy$total_ants_sp, na.rm = T))) +
scale_y_continuous(limits = c(0, max(tmp_count_accuracy$total_ants_sp, na.rm = T))) +
labs(x = "Number of ants counted after freezing",
y = "Number of ants counted in the lab", color = "Experimenter") +
theme_classic2() +
theme(legend.text = element_text(size = 8), # Smaller legend text
legend.title = element_text(size = 12, hjust = 0.5, vjust = 2), # Smaller legend title
legend.key.size = unit(0.6, "cm"), # Makes the legend squares smaller
axis.title.x = element_text(margin = margin(t = 10)), # adjust distance from axis title to axis
axis.title.y = element_text(margin = margin(r = 10), hjust = 0.5)) # adjust distance from axis title to axis, center title
)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_smooth()`).
Overall, the graph after 120 minutes shows that while ant counts are relatively accurate when few ants are present, counts during the experiment with live ants running around become somewhat wild conjectures when ants are numerous, e.g. above 100 ants. Surprisingly, it appears that citizens were more accurate when counting the ants after 120 minutes than the scientist (me). However, this might be due to the fact that one sample (23-22069) had 159 ants estimated at 120 minutes but only 6 ants counted in the lab.
In my case, part of the inaccuracy comes from the fact that often, many ants were foraging on the baits from underneath the A6 bait card, through the wet card, and were thus invisible when counting before picking up the card. The way I proceeded was to count the visible ants, pick up the card and place it in the resealable bag, and write down whether ants were present under the bait card. When only a few ants were under it and I could clearly count them, I added them to the total, but when too many ants to be able to count were under the card, I only wrote it as a comment instead of making a wild guess as to how many ants were under the card. For example, the strong outlier with > 300 ants in the card is experiment 24-11108, where I counted 76 ants visible at 120 minutes and wrote “lots [of ants] under card” as a comment.
Since I cannot know how the different participants dealt with counts and whether they encountered this issue of ants under the cards at all, it is difficult to know exactly what the difference between me and the citizens is due to.
rm(list = ls(pattern = "^tmp_"))
As I have also identified the species of ants found in the different experiments, I can visualize the diversity of ant species found, whether overall, per year or per species. I plot both the occurence (number of experiments where a species occured) and the number of individuals of each species found.
I also plot the number of experiments done per year and per city.
# create table with relevant columns
tmp_sp_distr <- env_analysis %>%
dplyr::select(c(sample_ID, sp_abbr, individuals, date, year, city)) %>% # select relevant columns
filter(individuals > 0 & !is.na(individuals) & !is.na(city)) # only keep rows with ants
# distribution of experiments per year
(tmp_exp_distr_year_plot <- ggplot(na.omit(cut_env_data), aes(x = as.factor(year), fill = as.factor(year))) +
geom_bar(position = "dodge") +
scale_fill_paletteer_d("nationalparkcolors::Acadia") + # Apply Acadia discrete color scale
labs(y = "Number of experiments") +
theme_bw() +
theme(legend.position = "none", # set legend text
axis.title.x = element_blank(), # adjust distance from axis title to axis, set text size
axis.title.y = element_text(margin = margin(r = 10), size = 12)) # adjust distance from axis title to axis, center title
)
# distribution of experiments per city
(tmp_exp_distr_city_plot <- ggplot(na.omit(cut_env_data), aes(x = as.factor(city), fill = as.factor(city))) +
geom_bar(position = "dodge") +
scale_fill_paletteer_d("nationalparkcolors::Acadia") + # Apply Acadia discrete color scale
labs(x = "City",
y = "Number of experiments") +
theme_bw() +
theme(legend.position = "none", # set legend text
axis.title.x = element_blank(), # adjust distance from axis title to axis, set text size
axis.title.y = element_text(margin = margin(r = 10), size = 12)) # adjust distance from axis title to axis, center title
)
# distribution of experiments per year and per city
(tmp_exp_distr_year_city_plot <- ggplot(na.omit(cut_env_data), aes(x = as.factor(year), fill = as.factor(city))) +
geom_bar(position = "dodge") +
scale_fill_paletteer_d("nationalparkcolors::Acadia") + # Apply Acadia discrete color scale
labs(x = "Year", y = "Number of experiments", fill = "City") +
theme_bw() +
theme(legend.text = element_text(size = 10), # set legend text
legend.title = element_text(size = 12, hjust = 0.5, vjust = 2), # Smaller legend title
legend.key.size = unit(0.5, "cm"), # Makes the legend squares smaller
axis.title.x = element_blank(), # adjust distance from axis title to axis, set text size
axis.title.y = element_text(margin = margin(r = 10), size = 12)) # adjust distance from axis title to axis, center title
)
# distribution of species per year
(tmp_sp_distr_year_plot <- ggplot(tmp_sp_distr, aes(x = as.factor(year), fill = as.factor(sp_abbr))) +
geom_bar(position = "dodge") +
scale_fill_viridis_d() + # Apply Acadia discrete color scale
labs(x = "Number of species occuring each year",
y = "Number of experiments", fill = "Species") +
theme_bw() +
theme(legend.text = element_text(size = 10, face = "italic"), # set legend text
legend.title = element_text(size = 12, hjust = 0.5, vjust = 2), # Smaller legend title
legend.key.size = unit(0.5, "cm"), # Makes the legend squares smaller
axis.title.x = element_text(margin = margin(t = 10), size = 12), # adjust distance from axis title to axis, set text size
axis.title.y = element_text(margin = margin(r = 10), size = 12))+ # adjust distance from axis title to axis, center title
guides(fill = guide_legend(ncol = 1)) # Set legend to 1 column
)
# distribution of species per city
(tmp_sp_distr_city_plot <- ggplot(tmp_sp_distr, aes(x = as.factor(city), fill = as.factor(sp_abbr))) +
geom_bar(position = "dodge") +
scale_fill_viridis_d() + # Apply Acadia discrete color scale
labs(x = "Number of species occuring in each city",
y = "Number of experiments", fill = "Species") +
theme_bw() +
theme(legend.text = element_text(size = 10, face = "italic"), # set legend text
legend.title = element_text(size = 12, hjust = 0.5, vjust = 2), # Smaller legend title
legend.key.size = unit(0.5, "cm"), # Makes the legend squares smaller
axis.title.x = element_text(margin = margin(t = 10), size = 12), # adjust distance from axis title to axis, set text size
axis.title.y = element_text(margin = margin(r = 10), size = 12))+ # adjust distance from axis title to axis, center title
guides(fill = guide_legend(ncol = 1)) # Set legend to 1 column
)
# overall species occurrence
(tmp_species_occ_plot <- ggplot(tmp_sp_distr, aes(x=sp_abbr, fill = sp_abbr))+
geom_bar()+
scale_fill_viridis_d()+
labs(y = "Number of experiments") +
theme_classic()+
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1, size = 12, face = "italic"),
legend.position = "none",
axis.title.x = element_blank(), # adjust distance from axis title to axis
axis.title.y = element_text(size = 12, margin = margin(r = 15), hjust = 0.5)) # adjust distance from axis title to axis, center title
)
# number of ants per species
(tmp_species_ind_plot <- ggplot(tmp_sp_distr, aes(x=sp_abbr, y=individuals, fill = sp_abbr))+
geom_col()+
scale_fill_viridis_d()+
labs(y = "Number of ant individuals") +
theme_classic()+
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1, size = 12, face = "italic"),
legend.position = "none",
axis.title.x = element_blank(), # adjust distance from axis title to axis
axis.title.y = element_text(size = 12, margin = margin(r = 15), hjust = 0.5)) # adjust distance from axis title to axis, center title
)
(tmp_species_grouped_plot <- tmp_species_ind_plot + tmp_species_ind_plot +
plot_layout(nrow = 2) + plot_annotation(tag_levels = 'a', tag_suffix = ")"))
Since I visited the 9 sites of my 2024 fieldwork 3 times over the span of 3 months, we can also plot the species diversity by month and by site for the 2024 experiments.
# create table with relevant columns and rows
tmp_sp_distr_24 <- env_analysis %>%
dplyr::select(c(sample_ID, sp_abbr, individuals, date_day_month, year, site)) %>% # select relevant columns
filter(individuals > 0 & !is.na(individuals) & year == 2024 & month(date_day_month) != 4) %>% # only keep rows with ants and 2024 data
mutate(month = case_when( # create new month column
(month(date_day_month) == 5 | as.character(date_day_month) == "1999-06-04") ~ "May",
(as.character(date_day_month) != "1999-06-04" & month(date_day_month) == 6) ~ "June",
month(date_day_month) > 6 ~ "July",
.default = NULL)) %>%
mutate(month = factor(month, levels = c("May", "June", "July"))) # Convert month to factor with ordered levels
# Create the plot per month
(tmp_map_sp24_month <- ggplot(tmp_sp_distr_24, aes(x = month, fill = sp_abbr)) +
geom_bar(position = "dodge") +
facet_wrap(~ site) +
scale_fill_viridis_d(option = "D") + # Apply Viridis discrete color scale
labs(x = "Species occurrence per month", y = "Number of experiments per site", fill = "Species") +
theme_bw() +
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 10), # Rotate x-axis labels
legend.text = element_text(size = 10, face = "italic"), # Smaller legend text
legend.title = element_text(size = 12, hjust = 0.5, vjust = 2), # Smaller legend title
legend.key.size = unit(0.6, "cm"), # Makes the legend squares smaller
strip.text = element_text(size = 10), # Smaller facet labels (location names)
axis.title.x = element_text(margin = margin(t = 10), size = 12), # adjust distance from axis title to axis
axis.title.y = element_text(margin = margin(r = 10), size = 12))+ # adjust distance from axis title to axis, center title
guides(fill = guide_legend(ncol = 1)) # Set legend to 1 column
)
rm(list = ls(pattern = "^tmp_"))
The overarching research question is: is ant foraging activity influenced by environmental conditions and land use ?
More specifically, does the average number of ants collected at the end of the baiting experiments depend on ground temperature, soil moisture, reported weather conditions, time of day, date of experiment, mean impervious surface coverage in a 100-meter radius around the experiment sites, and the interaction of these factors ?
Description of tested variables
I expect several of my explanatory variables, such as the time of day, weather, date, temperature to be correlated. Additionally, I have several measures for temperature. This correlation analysis allows to get a clearer idea of how the variables correlate and of which variables are the most relevant for my analysis.
I perform two correlation analyses, the first focusing only on the quantitative variables (excluding rain, clouds, shade, wind, city, and year) and the second including all variables. For the first analysis, I want to use the Pearson correlation coefficient, which assumes a linear relationship and that variables are normally distributed with no extreme outliers.
I start by testing the normality of the distribution of my variables as well as checking visually for outliers. From the Q-Q plots, it seems that the variables from the microclimate loggers and the temperature recorded by participants are relatively close to a normal distribution, although with some deviation at extreme values. the impervious surface, experiment start time, and date of experiment show clear deviation from normality. The results of the Shapiro-Wilk test confirm that all variables significantly differ from a normal distribution.
Since my continuous variables are not normally distributed, both correlation analyses (only quantitative variables, both quantitative and qualitative variables) are run using Spearman’s rank correlation coefficient.
#create a data frame with only the numerical variables
tmp_cor_plot_env_num <- cut_env_data[,c("t1_mean", "t2_mean", "t3_mean", "temperature", "imp_mean",
"vol_moist_mean", "minutes_since_8am",
"relative_day")]
# remove NAs from data set
tmp_cor_plot_env_num <- na.omit(tmp_cor_plot_env_num)
#rename variables for plotting
names(tmp_cor_plot_env_num)[names(tmp_cor_plot_env_num) == "t1_mean"] <- "Temperature below ground"
names(tmp_cor_plot_env_num)[names(tmp_cor_plot_env_num) == "t2_mean"] <- "Temperature at ground surface"
names(tmp_cor_plot_env_num)[names(tmp_cor_plot_env_num) == "t3_mean"] <- "Temperature above ground"
names(tmp_cor_plot_env_num)[names(tmp_cor_plot_env_num) == "temperature"] <- "Temperature (participants)"
names(tmp_cor_plot_env_num)[names(tmp_cor_plot_env_num) == "imp_mean"] <- "Impervious surface coverage"
names(tmp_cor_plot_env_num)[names(tmp_cor_plot_env_num) == "vol_moist_mean"] <- "Soil moisture"
names(tmp_cor_plot_env_num)[names(tmp_cor_plot_env_num) == "minutes_since_8am"] <- "Experiment start time"
names(tmp_cor_plot_env_num)[names(tmp_cor_plot_env_num) == "relative_day"] <- "Date of experiment"
# Checking for normality of variable distributions
# create empty list
tmp_qq_plots <- list()
# plot Q-Q plots for each variable
for (i in 1:ncol(tmp_cor_plot_env_num)) {
tmp_qq_plots[[i]] <- ggplot(tmp_cor_plot_env_num, aes(sample = .data[[colnames(tmp_cor_plot_env_num)[i]]])) +
stat_qq()+
stat_qq_line()+
ggtitle(str_wrap(colnames(tmp_cor_plot_env_num)[i], width = 20))+
theme(plot.title = element_text(size = 10))
}
# Arrange the plots in a 2-row, 4-column grid
(tmp_env_qqplot <- wrap_plots(tmp_qq_plots) + plot_layout(nrow = 2, ncol = 4))
# saving the plot
#ggsave(
# filename = "../images/data_exploration/env_var_qqplots.png",
# plot = tmp_env_qqplot,
# dpi = 300,
# height = 12,
# width = 20,
# units = "cm"
# )
# Shapiro-wilk test for normality
# Create an empty data frame to store results
tmp_shapiro_results <- data.frame(Variable = character(), P_Value = numeric(), stringsAsFactors = FALSE)
# Loop through each column in cor_plot_env_num
for (tmp_i in 1:ncol(tmp_cor_plot_env_num)) {
tmp_col_name <- colnames(tmp_cor_plot_env_num)[tmp_i] # Get the column name
tmp_test_result <- shapiro.test(tmp_cor_plot_env_num[[tmp_i]]) # Perform Shapiro-Wilk test
# Store results in the data frame
tmp_shapiro_results <- rbind(tmp_shapiro_results, data.frame(Variable = tmp_col_name, P_Value = tmp_test_result$p.value))
}
# Print the variables not significantly
print(tmp_shapiro_results)
## Variable P_Value
## 1 Temperature below ground 1.032684e-02
## 2 Temperature at ground surface 2.280288e-03
## 3 Temperature above ground 1.607317e-03
## 4 Temperature (participants) 9.123991e-03
## 5 Impervious surface coverage 2.093473e-09
## 6 Soil moisture 3.531498e-03
## 7 Experiment start time 2.765104e-10
## 8 Date of experiment 3.630909e-11
# compute the correlation matrix with p-values
tmp_env_cor_results <- rcorr(as.matrix(tmp_cor_plot_env_num), type = "spearman")
# extract correlation matrix and print
tmp_env_cor_matrix <- tmp_env_cor_results$r; tmp_env_cor_matrix
## Temperature below ground
## Temperature below ground 1.00000000
## Temperature at ground surface 0.83419975
## Temperature above ground 0.77569343
## Temperature (participants) 0.56721655
## Impervious surface coverage -0.07436956
## Soil moisture 0.11123787
## Experiment start time 0.42773839
## Date of experiment 0.48361254
## Temperature at ground surface
## Temperature below ground 0.8341997
## Temperature at ground surface 1.0000000
## Temperature above ground 0.9696263
## Temperature (participants) 0.6750450
## Impervious surface coverage -0.1012513
## Soil moisture 0.2087843
## Experiment start time 0.3006518
## Date of experiment 0.5032053
## Temperature above ground
## Temperature below ground 0.77569343
## Temperature at ground surface 0.96962631
## Temperature above ground 1.00000000
## Temperature (participants) 0.76636942
## Impervious surface coverage -0.08311249
## Soil moisture 0.28476850
## Experiment start time 0.33728020
## Date of experiment 0.57436184
## Temperature (participants)
## Temperature below ground 0.56721655
## Temperature at ground surface 0.67504503
## Temperature above ground 0.76636942
## Temperature (participants) 1.00000000
## Impervious surface coverage 0.03619624
## Soil moisture 0.24254308
## Experiment start time 0.48148038
## Date of experiment 0.48527967
## Impervious surface coverage Soil moisture
## Temperature below ground -0.07436956 0.1112379
## Temperature at ground surface -0.10125127 0.2087843
## Temperature above ground -0.08311249 0.2847685
## Temperature (participants) 0.03619624 0.2425431
## Impervious surface coverage 1.00000000 -0.1498928
## Soil moisture -0.14989285 1.0000000
## Experiment start time -0.06093111 0.2555638
## Date of experiment -0.10983860 0.3984425
## Experiment start time Date of experiment
## Temperature below ground 0.42773839 0.4836125
## Temperature at ground surface 0.30065183 0.5032053
## Temperature above ground 0.33728020 0.5743618
## Temperature (participants) 0.48148038 0.4852797
## Impervious surface coverage -0.06093111 -0.1098386
## Soil moisture 0.25556384 0.3984425
## Experiment start time 1.00000000 0.2355542
## Date of experiment 0.23555420 1.0000000
# extract the p-values
tmp_env_p_matrix <- tmp_env_cor_results$P
# plot correlation matrix (only insignificant ones hidden)
tmp_corr_num <- corrplot(tmp_env_cor_matrix,
type = "upper",
method = "color",
diag = F, # remove diagonal
#addCoef.col = "black", # add coefficient coeffs, not used
p.mat = tmp_env_p_matrix, # add p-values matrix
sig.level = c(0.001, 0.01, 0.05), # significance thresholds
tl.col = "black", # color of variable labels
cl.align.text = "l", # alignment of color legend
cl.offset = 0.3, # offset color legend text to the right
number.cex = 0.6, # size of coefficient text
tl.cex = 0.7, # size of variable labels
insig = "label_sig", # add stars according to significance thresholds
pch.cex = 1) # size of stars
# save the plot as a png, commented out for knitting of the rmarkdown file
# dev.copy(png, "../images/data_exploration/correlation_plot_num.png", width = 1600, height = 1600, res = 300)
# dev.off()
#create a data frame with only the numerical variables
tmp_cor_plot_env_all <- cut_env_data[,c("t1_mean", "t2_mean", "t3_mean", "temperature",
"imp_mean", "vol_moist_mean", "minutes_since_8am",
"relative_day", "clouds_numeric", "shade_numeric",
"wind_numeric", "rain_numeric", "year", "city")]
tmp_cor_plot_env_all <- tmp_cor_plot_env_all %>%
# Code the city variable as numbers
mutate(city = dplyr::recode(city, "Halle" = 1, "Leipzig" = 2, "Berlin" = 3)) %>%
# Remove NAs
na.omit() %>%
# Rename variables for plotting
rename(
"Temperature below ground" = t1_mean,
"Temperature at ground surface" = t2_mean,
"Temperature above ground" = t3_mean,
"Temperature (participants)" = temperature,
"Impervious surface coverage" = imp_mean,
"Soil moisture" = vol_moist_mean,
"Experiment start time" = minutes_since_8am,
"Date of experiment" = relative_day,
"Year" = year,
"Cloud cover (participants)" = clouds_numeric,
"Shade (participants)" = shade_numeric,
"Wind (participants)" = wind_numeric,
"Rain (participants)" = rain_numeric,
"City" = city)
# compute the correlation matrix with p-values
tmp_env_cor_all_results <- rcorr(as.matrix(tmp_cor_plot_env_all), type = "spearman")
# extract correlation matrix and print results
tmp_env_cor_all_matrix <- tmp_env_cor_all_results$r; tmp_env_cor_all_matrix
## Temperature below ground
## Temperature below ground 1.00000000
## Temperature at ground surface 0.83328957
## Temperature above ground 0.77561643
## Temperature (participants) 0.56644092
## Impervious surface coverage -0.06592400
## Soil moisture 0.11655318
## Experiment start time 0.42729222
## Date of experiment 0.48418019
## Cloud cover (participants) -0.08455477
## Shade (participants) -0.45324889
## Wind (participants) 0.05391416
## Rain (participants) 0.04470452
## Year 0.36997778
## City -0.22694091
## Temperature at ground surface
## Temperature below ground 0.83328957
## Temperature at ground surface 1.00000000
## Temperature above ground 0.97001775
## Temperature (participants) 0.67232219
## Impervious surface coverage -0.08762115
## Soil moisture 0.21644768
## Experiment start time 0.29861318
## Date of experiment 0.50060143
## Cloud cover (participants) -0.31001774
## Shade (participants) -0.41983030
## Wind (participants) -0.06590040
## Rain (participants) -0.07483132
## Year 0.37967491
## City -0.15631718
## Temperature above ground
## Temperature below ground 0.77561643
## Temperature at ground surface 0.97001775
## Temperature above ground 1.00000000
## Temperature (participants) 0.76276989
## Impervious surface coverage -0.06606175
## Soil moisture 0.29242172
## Experiment start time 0.33550292
## Date of experiment 0.57260749
## Cloud cover (participants) -0.31793377
## Shade (participants) -0.32468873
## Wind (participants) -0.09947974
## Rain (participants) -0.09013773
## Year 0.39127259
## City -0.12935776
## Temperature (participants)
## Temperature below ground 0.56644092
## Temperature at ground surface 0.67232219
## Temperature above ground 0.76276989
## Temperature (participants) 1.00000000
## Impervious surface coverage 0.05441409
## Soil moisture 0.25264427
## Experiment start time 0.48117322
## Date of experiment 0.47807917
## Cloud cover (participants) -0.20012325
## Shade (participants) 0.03623059
## Wind (participants) -0.13054458
## Rain (participants) -0.09839737
## Year 0.19799213
## City -0.09189725
## Impervious surface coverage Soil moisture
## Temperature below ground -0.06592400 0.11655318
## Temperature at ground surface -0.08762115 0.21644768
## Temperature above ground -0.06606175 0.29242172
## Temperature (participants) 0.05441409 0.25264427
## Impervious surface coverage 1.00000000 -0.14347183
## Soil moisture -0.14347183 1.00000000
## Experiment start time -0.05024439 0.25020451
## Date of experiment -0.10377918 0.39844625
## Cloud cover (participants) 0.06855137 0.03551604
## Shade (participants) 0.12346464 -0.02065546
## Wind (participants) -0.03088054 -0.07286811
## Rain (participants) 0.04835307 -0.13702879
## Year -0.24167275 0.49475040
## City 0.23016342 -0.18950715
## Experiment start time Date of experiment
## Temperature below ground 0.42729222 0.4841802
## Temperature at ground surface 0.29861318 0.5006014
## Temperature above ground 0.33550292 0.5726075
## Temperature (participants) 0.48117322 0.4780792
## Impervious surface coverage -0.05024439 -0.1037792
## Soil moisture 0.25020451 0.3984462
## Experiment start time 1.00000000 0.2283132
## Date of experiment 0.22831317 1.0000000
## Cloud cover (participants) 0.21900567 -0.1270443
## Shade (participants) -0.06504575 -0.1368369
## Wind (participants) 0.11053493 -0.1000801
## Rain (participants) -0.08333678 0.1544130
## Year 0.30444049 0.6727533
## City -0.17880359 -0.2211489
## Cloud cover (participants) Shade (participants)
## Temperature below ground -0.084554774 -0.453248893
## Temperature at ground surface -0.310017741 -0.419830298
## Temperature above ground -0.317933767 -0.324688725
## Temperature (participants) -0.200123247 0.036230586
## Impervious surface coverage 0.068551366 0.123464644
## Soil moisture 0.035516039 -0.020655462
## Experiment start time 0.219005670 -0.065045753
## Date of experiment -0.127044301 -0.136836908
## Cloud cover (participants) 1.000000000 0.009229617
## Shade (participants) 0.009229617 1.000000000
## Wind (participants) 0.184981936 -0.102597542
## Rain (participants) 0.274239271 -0.037687663
## Year -0.018226670 -0.241978620
## City -0.152549162 0.106695073
## Wind (participants) Rain (participants)
## Temperature below ground 0.05391416 0.04470452
## Temperature at ground surface -0.06590040 -0.07483132
## Temperature above ground -0.09947974 -0.09013773
## Temperature (participants) -0.13054458 -0.09839737
## Impervious surface coverage -0.03088054 0.04835307
## Soil moisture -0.07286811 -0.13702879
## Experiment start time 0.11053493 -0.08333678
## Date of experiment -0.10008007 0.15441302
## Cloud cover (participants) 0.18498194 0.27423927
## Shade (participants) -0.10259754 -0.03768766
## Wind (participants) 1.00000000 0.03047381
## Rain (participants) 0.03047381 1.00000000
## Year 0.05421455 0.05657501
## City -0.15087292 -0.19823003
## Year City
## Temperature below ground 0.36997778 -0.22694091
## Temperature at ground surface 0.37967491 -0.15631718
## Temperature above ground 0.39127259 -0.12935776
## Temperature (participants) 0.19799213 -0.09189725
## Impervious surface coverage -0.24167275 0.23016342
## Soil moisture 0.49475040 -0.18950715
## Experiment start time 0.30444049 -0.17880359
## Date of experiment 0.67275329 -0.22114893
## Cloud cover (participants) -0.01822667 -0.15254916
## Shade (participants) -0.24197862 0.10669507
## Wind (participants) 0.05421455 -0.15087292
## Rain (participants) 0.05657501 -0.19823003
## Year 1.00000000 -0.42821268
## City -0.42821268 1.00000000
# extract the p-values
tmp_env_p_all_matrix <- tmp_env_cor_all_results$P
# plot correlation matrix (only insignificant ones hidden)
corrplot(tmp_env_cor_all_matrix,
type = "upper",
method = "color",
diag = F, # remove diagonal
#addCoef.col = "black", # add coefficient coeffs, not used
p.mat = tmp_env_p_all_matrix, # add p-values matrix
sig.level = c(0.001, 0.01, 0.05), # significance thresholds
tl.col = "black", # color of variable labels
cl.align.text = "l", # alignment of color legend
cl.offset = 0.3, # offset color legend text to the right
number.cex = 0.6, # size of coefficient text
tl.cex = 0.7, # size of variable labels
insig = "label_sig", # add start according to significance thresholds
pch.cex = 1) # size of stars
# save the plot as a png, commented out for knitting of the rmarkdown file
# dev.copy(png, "../images/data_exploration/correlation_plot_all.png", width = 1600, height = 1600, res = 300)
# dev.off()
All temperature measures are highly correlated (r > 0.5), which is a good control and suggests that there are no significant discrepancies in the temperature measures since they align well. For analysis, the ground temperature will be used as the sole temperature variable as it is the measure that should best reflect the temperature experienced by foraging ants.
#create a data frame with only the numerical variables
tmp_cor_plot_env_t2 <- cut_env_data[,c("t2_mean", "imp_mean", "vol_moist_mean", "minutes_since_8am",
"relative_day", "clouds_numeric", "shade_numeric",
"wind_numeric", "rain_numeric",
"year", "city")]
tmp_cor_plot_env_t2 <- tmp_cor_plot_env_t2 %>%
# Code the city variable as numbers
mutate(city = dplyr::recode(city, "Halle" = 1, "Leipzig" = 2, "Berlin" = 3)) %>%
# Remove NAs
na.omit() %>%
# Rename variables for plotting
rename(
"Temperature at ground surface" = t2_mean,
"Impervious surface coverage" = imp_mean,
"Soil moisture" = vol_moist_mean,
"Experiment start time" = minutes_since_8am,
"Date of experiment" = relative_day,
"Year" = year,
"Cloud cover (participants)" = clouds_numeric,
"Shade (participants)" = shade_numeric,
"Wind (participants)" = wind_numeric,
"Rain (participants)" = rain_numeric,
"City" = city)
# compute the correlation matrix with p-values
tmp_env_cor_t2_results <- rcorr(as.matrix(tmp_cor_plot_env_t2), type = "spearman")
# extract correlation matrix and print
tmp_env_cor_t2_matrix <- tmp_env_cor_t2_results$r; tmp_env_cor_t2_matrix
## Temperature at ground surface
## Temperature at ground surface 1.00000000
## Impervious surface coverage -0.07394311
## Soil moisture 0.22230648
## Experiment start time 0.31547805
## Date of experiment 0.48487924
## Cloud cover (participants) -0.29627708
## Shade (participants) -0.42511345
## Wind (participants) -0.05030683
## Rain (participants) -0.07351319
## Year 0.37590084
## City -0.16435596
## Impervious surface coverage Soil moisture
## Temperature at ground surface -0.07394311 0.22230648
## Impervious surface coverage 1.00000000 -0.15129769
## Soil moisture -0.15129769 1.00000000
## Experiment start time -0.04392663 0.25843734
## Date of experiment -0.11530860 0.41139718
## Cloud cover (participants) 0.06500863 0.04021584
## Shade (participants) 0.12149443 -0.03390096
## Wind (participants) -0.02918913 -0.06670234
## Rain (participants) 0.04500706 -0.13194680
## Year -0.24325424 0.50272161
## City 0.23700457 -0.20879219
## Experiment start time Date of experiment
## Temperature at ground surface 0.31547805 0.48487924
## Impervious surface coverage -0.04392663 -0.11530860
## Soil moisture 0.25843734 0.41139718
## Experiment start time 1.00000000 0.22775770
## Date of experiment 0.22775770 1.00000000
## Cloud cover (participants) 0.22448854 -0.12096476
## Shade (participants) -0.08029459 -0.14166884
## Wind (participants) 0.11922796 -0.09291709
## Rain (participants) -0.08081920 0.15634744
## Year 0.30949044 0.67584872
## City -0.19153845 -0.23546431
## Cloud cover (participants) Shade (participants)
## Temperature at ground surface -0.296277077 -0.425113446
## Impervious surface coverage 0.065008632 0.121494435
## Soil moisture 0.040215842 -0.033900960
## Experiment start time 0.224488536 -0.080294586
## Date of experiment -0.120964762 -0.141668836
## Cloud cover (participants) 1.000000000 0.007338021
## Shade (participants) 0.007338021 1.000000000
## Wind (participants) 0.179994439 -0.112799893
## Rain (participants) 0.273546537 -0.038069826
## Year -0.016298624 -0.254472564
## City -0.157434988 0.122652798
## Wind (participants) Rain (participants)
## Temperature at ground surface -0.05030683 -0.07351319
## Impervious surface coverage -0.02918913 0.04500706
## Soil moisture -0.06670234 -0.13194680
## Experiment start time 0.11922796 -0.08081920
## Date of experiment -0.09291709 0.15634744
## Cloud cover (participants) 0.17999444 0.27354654
## Shade (participants) -0.11279989 -0.03806983
## Wind (participants) 1.00000000 0.03043765
## Rain (participants) 0.03043765 1.00000000
## Year 0.06086049 0.05864652
## City -0.15213195 -0.19803643
## Year City
## Temperature at ground surface 0.37590084 -0.1643560
## Impervious surface coverage -0.24325424 0.2370046
## Soil moisture 0.50272161 -0.2087922
## Experiment start time 0.30949044 -0.1915384
## Date of experiment 0.67584872 -0.2354643
## Cloud cover (participants) -0.01629862 -0.1574350
## Shade (participants) -0.25447256 0.1226528
## Wind (participants) 0.06086049 -0.1521320
## Rain (participants) 0.05864652 -0.1980364
## Year 1.00000000 -0.4438117
## City -0.44381173 1.0000000
# extract the p-values
tmp_env_p_t2_matrix <- tmp_env_cor_t2_results$P
# plot correlation matrix (only insignificant ones hidden)
corrplot(tmp_env_cor_t2_matrix,
type = "upper",
method = "color",
diag = F, # remove diagonal
#addCoef.col = "black", # add coefficient coeffs, not used
p.mat = tmp_env_p_t2_matrix, # add p-values matrix
sig.level = c(0.001, 0.01, 0.05), # significance thresholds
tl.col = "black", # color of variable labels
cl.align.text = "l", # alignment of color legend
cl.offset = 0.3, # offset color legend text to the right
number.cex = 0.6, # size of coefficient text
tl.cex = 0.8, # size of variable labels
cl.cex = 0.7, # size of color legend text
insig = "label_sig", # add start according to significance thresholds
pch.cex = 1) # size of stars
# save the plot as a png, commented out for knitting of rmarkdown file
# dev.copy(png, "../images/data_exploration/correlation_plot_T2.png", width = 1600, height = 1600, res = 300)
# dev.off()
The results of the correlation analysis show:
Year and date of experiment are strongly correlated (r = 0.68), and year and city are moderately correlated (r = -0.44). This is expected as the 2024 experiments were only conducted in Leipzig and over a broader range of dates, while the experiments in 2022 and 2023 were conducted in all 3 cities and usually earlier in the year. Year is also moderately correlated to experiment start time (r = 0.31), soil moisture (r = 0.50), and ground temperature (r = 0.38), which may be due to the fact that experiments in 2024 were more spread out over the time of day and the date than in the previous year. This means the year column may reflect trends in seasonal and daily temperature and moisture variations.
Shade and temperature are moderately correlated (r = -0.43), which makes sense since shade lowers ground temperature. The shade variable could be an indirect indication of habitat structure around the baits (i.e., more or less trees around the experiment) or other microclimate characteristics, but it may also provide information which is mostly redundant with temperature. Models with and without shade as a fixed effect are compared below to see if keeping shade as a predictor improves the model or not.
Date of experiment is moderately correlated with temperature (r = 0.48) and soil moisture (r = 0.41), which likely reflects seasonal variations in temperature and weather.
Experiment start time is moderately correlated with temperature (r = 0.32), which reflects daily variations in temperature.
Cloud cover is moderately correlated with temperature (r = -0.30), which is to be expected as more cloud cover can have a negative impact on ground temperature.
Other variables are either weakly (r < 0.3) or non-significantly correlated.
A Variance Inflation Factor test can also be run on the variables (city and year removed since they will not be used as fixed effects) and shows that all variables have a VIF < 5, suggesting no significant multicollinearity.
tmp_vif_model <- lm(scaled_species_total ~ t2_mean + imp_mean + vol_moist_mean +
minutes_since_8am + relative_day +
clouds_numeric + wind_numeric +
shade_numeric + rain_numeric,
data = env_analysis)
vif(tmp_vif_model) # Run VIF test
## t2_mean imp_mean vol_moist_mean minutes_since_8am
## 1.961449 1.061534 1.264533 1.286789
## relative_day clouds_numeric wind_numeric shade_numeric
## 1.689057 1.445597 1.137733 1.293400
## rain_numeric
## 1.286647
rm(list = ls(pattern = "^tmp_"))
Based on the correlation analysis, I can define which variables are used as predictors (fixed effects) or random effects.
While rain would be an interesting predictor to measure, I am not going to include the absence or presence of rain as a predictor variable because its distribution is very unbalanced in my data: out of 239 experiments, only 7 experiments had rain while 229 did not.
table(cut_env_data$rain)
##
## no rain rain
## 229 7
All other predictor variables than rain will be added into the initial model, but several variable combinations will be tested to find the best model fit.
This section details different visualizations of the data.
# distribution of raw ant totals
ggplot(na.omit(env_analysis), aes(x = individuals)) +
geom_histogram(bins = 15, fill = "skyblue", color = "black") +
labs(x = "Raw number of ants per species and experiment", y = "Frequency")+
theme(plot.title = element_text(hjust = 0.5))+
theme_bw()
# distribution of scaled ant totals
ggplot(na.omit(env_analysis), aes(x = scaled_species_total))+
geom_histogram(bins = 15, fill = "skyblue", color = "black") +
labs(x = "Scaled number of ants per species and experiment", y = "Frequency")+
theme(plot.title = element_text(hjust = 0.5))+
theme_bw()
I can visualize the distributions of continuous explanatory variables as well as the relationship between the response and each variable.
# histogram of continuous predictors
na.omit(env_analysis) %>%
gather(key = "variable", value = "value", t2_mean, vol_moist_mean, imp_mean) %>%
ggplot(aes(x = value)) +
geom_histogram(fill = "skyblue", color = "black", bins = 20) +
facet_wrap(~ variable, scales = "free", labeller = labeller(variable = c(
t2_mean = "Ground Temperature (\u00B0C)",
vol_moist_mean = "Soil Moisture (%)",
imp_mean = "Impervious Surface Cover (%)"))) +
labs(title = "Distributions of Continuous Predictors")+
theme_bw()
# raw ants vs. ground temperature
tmp_raw_ant_t2_plot <- ggplot(na.omit(env_analysis), aes(x = t2_mean, y = individuals)) +
geom_point() +
geom_smooth(method = "loess", se = FALSE, color = "lightblue") +
labs(title = "Raw ant count per species \nvs. Ground temperature",
x = "Ground temperature (\u00B0C)", y = "Raw number of ants per species")+
theme_bw()
# Araw nts vs. Soil Moisture
tmp_raw_ant_moist_plot <- ggplot(na.omit(env_analysis), aes(x = vol_moist_mean, y = individuals)) +
geom_point() +
geom_smooth(method = "loess", se = FALSE, color = "darkgreen") +
labs(title = "Raw ant count per species \nvs. Soil moisture",
x = "Volumetric soil moisture (%)", y = "Raw number of ants per species")+
theme_bw()
# raw Ants vs. Impervious Surface
tmp_raw_ant_imp_plot <- ggplot(na.omit(env_analysis), aes(x = imp_mean, y = individuals)) +
geom_point() +
geom_smooth(method = "loess", se = FALSE, color = "darkred") +
labs(title = "Raw ant count per species \nvs. Impervious surface cover",
x = "Impervious surface cover (%)", y = "Raw number of ants per species")+
theme_bw()
# combine into 1 plot
(tmp_raw_ant_contpred_plot <- tmp_raw_ant_t2_plot + tmp_raw_ant_moist_plot +
tmp_raw_ant_imp_plot + plot_layout(nrow = 1)) # Arrange in 1 row
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
# scaled ants vs. ground temperature
tmp_scaled_ant_t2_plot <- ggplot(na.omit(env_analysis), aes(x = t2_mean, y = scaled_species_total)) +
geom_point() +
geom_smooth(method = "loess", se = FALSE, color = "lightblue") +
labs(title = "Scaled ant count per species \nvs. Ground temperature",
x = "Ground temperature (\u00B0C)", y = "Scaled number of ants per species")+
theme_bw()
# scaled Ants vs. Soil Moisture
tmp_scaled_ant_moist_plot <- ggplot(na.omit(env_analysis), aes(x = vol_moist_mean, y = scaled_species_total)) +
geom_point() +
geom_smooth(method = "loess", se = FALSE, color = "darkgreen") +
labs(title = "Scaled ant count per species \nvs. Soil moisture",
x = "Volumetric soil moisture (%)", y = "Scaled number of ants per species")+
theme_bw()
# scaled Ants vs. Impervious Surface
tmp_scaled_ant_imp_plot <- ggplot(na.omit(env_analysis), aes(x = imp_mean, y = scaled_species_total)) +
geom_point() +
geom_smooth(method = "loess", se = FALSE, color = "darkred") +
labs(title = "Scaled ant count per species \nvs. Impervious surface cover",
x = "Impervious surface cover (%)", y = "Scaled number of ants per species")+
theme_bw()
# combine into 1 plot
(tmp_scaled_ant_contpred_plot <- tmp_scaled_ant_t2_plot + tmp_scaled_ant_moist_plot +
tmp_scaled_ant_imp_plot + plot_layout(nrow = 1)) # Arrange in 1 row
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
# distribution of cloud cover across exp
ggplot(na.omit(cut_env_data), aes(x = factor(clouds, levels = c(
"no clouds", "lightly cloudy", "very cloudy")))) +
geom_bar(fill = "lightblue") +
labs(x = "Cloud cover", y = "Number of experiments") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))+
theme_bw()
# distribution of wind across exp
ggplot(na.omit(cut_env_data), aes(x = factor(wind, levels = c(
"no wind", "weak wind", "strong wind")))) +
geom_bar(fill = "lightblue") +
labs(x = "Wind", y = "Number of experiments") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))+
theme_bw()
# distribution of shade across exp
ggplot(na.omit(cut_env_data), aes(x = factor(shade, levels = c(
"no shade", "half-shade", "full shade")))) +
geom_bar(fill = "lightblue") +
labs(x = "Shade", y = "Number of experiments") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))+
theme_bw()
# distribution of rain across exp
ggplot(na.omit(cut_env_data), aes(x = factor(rain))) +
geom_bar(fill = "lightblue") +
labs(x = "Rain", y = "Number of experiments") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))+
theme_bw()
# scaled ant count vs clouds
ggplot(na.omit(env_analysis), aes(x = clouds, y = scaled_species_total)) +
geom_boxplot(fill = "lightblue") +
labs(x = "Cloud cover", y = "Scaled number of ants per species") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# scaled ant count vs wind
ggplot(na.omit(env_analysis), aes(x = wind, y = scaled_species_total)) +
geom_boxplot(fill = "lightblue") +
labs(x = "Wind", y = "Scaled number of ants per species") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# scaled ant count vs shade
ggplot(na.omit(env_analysis), aes(x = shade, y = scaled_species_total)) +
geom_boxplot(fill = "lightblue") +
labs(x = "Shade", y = "Scaled number of ants per species") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# distribution of experiment date
ggplot(na.omit(env_analysis), aes(x = relative_day)) +
geom_histogram(bins = 25, fill = "skyblue", color = "black") +
labs(x = "Relative date of experiment", y = "Frequency")+
theme(plot.title = element_text(hjust = 0.5))+
theme_bw()
# distribution of experiment start time
ggplot(na.omit(env_analysis), aes(x = minutes_since_8am)) +
geom_histogram(bins = 25, fill = "skyblue", color = "black") +
labs(x = "Minutes since 8 AM", y = "Frequency")+
theme(plot.title = element_text(hjust = 0.5))+
theme_bw()
# raw Ants vs. relative day
tmp_raw_ant_day_plot <- ggplot(na.omit(env_analysis), aes(x = relative_day, y = individuals)) +
geom_point() +
geom_smooth(method = "loess", se = FALSE, color = "darkred") +
labs(title = "Raw ant count per species \nvs. Relative date",
x = "Relative date", y = "Raw number of ants per species")+
theme_bw()
# raw Ants vs. time of day
tmp_raw_ant_time_plot <- ggplot(na.omit(env_analysis), aes(x = minutes_since_8am, y = individuals)) +
geom_point() +
geom_smooth(method = "loess", se = FALSE, color = "darkgreen") +
labs(title = "Raw ant count per species \nvs. Time of day",
x = "Minutes since 8 AM", y = "Raw number of ants per species")+
theme_bw()
# combine into 1 plot
(tmp_raw_ant_timedate_plot <- tmp_raw_ant_day_plot + tmp_raw_ant_time_plot + plot_layout(nrow = 1)) # Arrange in 1 row
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
# scaled Ants vs. relative day
tmp_scaled_ant_day_plot <- ggplot(na.omit(env_analysis), aes(x = relative_day, y = individuals)) +
geom_point() +
geom_smooth(method = "loess", se = FALSE, color = "darkred") +
labs(title = "Scaled ant count per species \nvs. Relative date",
x = "Relative date", y = "Scaled number of ants per species")+
theme_bw()
# scaled Ants vs. time of day
tmp_scaled_ant_time_plot <- ggplot(na.omit(env_analysis), aes(x = minutes_since_8am, y = individuals)) +
geom_point() +
geom_smooth(method = "loess", se = FALSE, color = "darkgreen") +
labs(title = "Scaled ant count per species \nvs. Time of day",
x = "Minutes since 8 AM", y = "Scaled number of ants per species")+
theme_bw()
# combine into 1 plot
(tmp_scaled_ant_timedate_plot <- tmp_scaled_ant_day_plot + tmp_scaled_ant_time_plot +
plot_layout(nrow = 1)) # Arrange in 1 row
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
It is apparent that since my data contains a very high amount of zero values, the plots of the raw and scaled ant counts are not very informative in terms of trends in the data.
I plot the distribution of the amount of ants found per site.
# create df for plots
tmp_site_ind <- left_join(cut_env_data, sp_totals, by = "sample_ID")
tmp_site_counts <- cut_env_data %>%
count(site) %>% # Count experiments per site
left_join(select(cut_env_data, site, city), by = "site") %>%
distinct(site, .keep_all = TRUE) # Keep only one row per site
# distribution of sites
ggplot(na.omit(tmp_site_counts), aes(x = factor(site, levels = unique(env_analysis$site)), y = n, fill = city)) +
geom_bar(stat = "identity") +
scale_fill_paletteer_d("nationalparkcolors::Acadia") + # Apply Acadia discrete color scale
coord_flip() + # Flips the axes for better readability
labs(title = "Number of Experiments per Site",
x = "Site", y = "Number of Experiments") +
theme_classic2() +
theme(legend.text = element_text(size = 10), # set legend text
legend.title = element_text(size = 12, hjust = 0.5, vjust = 2), # Smaller legend title
legend.key.size = unit(0.4, "cm"), # Makes the legend squares smaller
axis.title.x = element_text(margin = margin(r = 10), size = 12), # adjust distance from axis title to axis, set text size
axis.title.y = element_text(margin = margin(r = 10), size = 12)) # adjust distance from axis title to axis, center title
# number of ants per site
(tmp_ind_site_plot <- ggplot(na.omit(tmp_site_ind), aes(x=site, y=total_ants_sp, fill = site))+
geom_col()+
scale_fill_viridis_d()+
labs(y = "Raw number of ants") +
theme_classic()+
theme(axis.text.x = element_text(angle = 45, hjust=1, size = 8),
legend.position = "none",
axis.title.x = element_blank(), # adjust distance from axis title to axis
axis.title.y = element_text(margin = margin(r = 10), hjust = 0.5)) # adjust distance from axis title to axis, center title
)
rm(list = ls(pattern = "^tmp_"))
I am working with count data and multiple explanatory variables (both quantitative and categorical), as well as random effects. Additionally, my scaling process for the response variable at species level has added a lot of structural zeros to my data, since for each experiment there are many species which were not found. Thus, a zero-inflated model is necessary.
I try to fit a zero-inflated binomial (ZIB) GLMM with the number of ants of each species on each experiment representing a “success” and the maximum number of ants for each species across experiments - the number of ants of each species on each experiment as a “failure”. I also try to fit a zero-inflated Poisson (ZIP) GLMM with the number of ants of each species on each experiment as the response variable, and the log of the maximum number of ants of each species as an offset.
I first fit a base model with all of the predictor variables described in section 1.1. I fit both a ZIB GLMM and a ZIP GLMM and compare model performance and diagnostic checks to choose the best fitting model.
Zero-Inflated Binomial GLMM
# zero-inflated binomial model
tmp_model1_zib <- glmmTMB(cbind(individuals, species_max - individuals) ~
t2_mean + vol_moist_mean + imp_mean +
minutes_since_8am + relative_day +
clouds_numeric + wind_numeric + shade_numeric +
(1 | site) + (1 | sp_abbr), # random effects
data = env_analysis,
ziformula = ~ 1, # Zero-inflation
family = binomial) # Binomial distribution
# show anova table, ensure the model has converged without issues
Anova(tmp_model1_zib, type = "II")
## Analysis of Deviance Table (Type II Wald chisquare tests)
##
## Response: cbind(individuals, species_max - individuals)
## Chisq Df Pr(>Chisq)
## t2_mean 17.609 1 2.713e-05 ***
## vol_moist_mean 84.427 1 < 2.2e-16 ***
## imp_mean 95.417 1 < 2.2e-16 ***
## minutes_since_8am 189.393 1 < 2.2e-16 ***
## relative_day 29.551 1 5.445e-08 ***
## clouds_numeric 202.968 1 < 2.2e-16 ***
## wind_numeric 171.182 1 < 2.2e-16 ***
## shade_numeric 355.587 1 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Check for overdispersion, multicollinearity with performance package
check_overdispersion(tmp_model1_zib) # overdispersion
## # Overdispersion test
##
## dispersion ratio = 0.656
## p-value = 0.576
## No overdispersion detected.
check_collinearity(tmp_model1_zib) # multicollinearity
## # Check for Multicollinearity
##
## * conditional component:
##
## Low Correlation
##
## Term VIF VIF 95% CI Increased SE Tolerance Tolerance 95% CI
## t2_mean 2.29 [2.20, 2.40] 1.51 0.44 [0.42, 0.46]
## vol_moist_mean 1.23 [1.19, 1.28] 1.11 0.81 [0.78, 0.84]
## imp_mean 1.08 [1.05, 1.12] 1.04 0.93 [0.89, 0.95]
## minutes_since_8am 1.38 [1.34, 1.44] 1.18 0.72 [0.70, 0.75]
## relative_day 1.93 [1.85, 2.01] 1.39 0.52 [0.50, 0.54]
## clouds_numeric 1.48 [1.43, 1.54] 1.22 0.68 [0.65, 0.70]
## wind_numeric 1.53 [1.47, 1.59] 1.24 0.66 [0.63, 0.68]
## shade_numeric 1.43 [1.38, 1.48] 1.19 0.70 [0.68, 0.73]
# check models assumptions and residuals with performance package
check_model(tmp_model1_zib, verbose = T)
## `check_outliers()` does not yet support models of class `glmmTMB`.
## Using `ci_type = "gaussian"` because model is not bernoulli.
# simulate residuals and check model assumptions with DHARMa package
tmp_sim_model1 <- simulateResiduals(fittedModel = tmp_model1_zib)
plot(tmp_sim_model1)
# Compute fitted values and Pearson residuals
tmp_fitted_vals <- fitted(tmp_model1_zib)
tmp_residuals <- residuals(tmp_model1_zib, type = "pearson")
# Create binned residuals plot
arm::binnedplot(tmp_fitted_vals, tmp_residuals)
The ZIB-GLMM shows no pattern in the residuals plot from the
DHARMa package and the binned plot from the
arm package shows no clear trend, although the binned
residuals plot from the performance package seems to show a
slight downwards trend. There is no sign of overdispersion or high
collinearity.
Zero-Inflated Poisson GLMM
# zero-inflated Poisson model
tmp_model1_zip <- glmmTMB(individuals ~ t2_mean + vol_moist_mean + imp_mean +
minutes_since_8am + relative_day +
clouds_numeric + wind_numeric + shade_numeric +
(1 | site) + (1 | sp_abbr), # random effects
offset = log(species_max),
ziformula = ~1, # zero-inflation
family = poisson, # poisson distribution
data = na.omit(env_analysis))
## Warning in (function (start, objective, gradient = NULL, hessian = NULL, :
## NA/NaN function evaluation
# show anova table, ensure model converged without issues
Anova(tmp_model1_zip, type = "II")
## Analysis of Deviance Table (Type II Wald chisquare tests)
##
## Response: individuals
## Chisq Df Pr(>Chisq)
## t2_mean 2.5383 1 0.1111
## vol_moist_mean 120.8991 1 < 2.2e-16 ***
## imp_mean 84.5972 1 < 2.2e-16 ***
## minutes_since_8am 125.0182 1 < 2.2e-16 ***
## relative_day 35.1442 1 3.062e-09 ***
## clouds_numeric 79.8822 1 < 2.2e-16 ***
## wind_numeric 48.0360 1 4.185e-12 ***
## shade_numeric 196.3327 1 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Check for overdispersion, multicollinearity with performance package
check_overdispersion(tmp_model1_zip) # overdispersion
## # Overdispersion test
##
## dispersion ratio = 0.000
## p-value = 0.088
## No overdispersion detected.
check_collinearity(tmp_model1_zip) # multicollinearity
## # Check for Multicollinearity
##
## * conditional component:
##
## Low Correlation
##
## Term VIF VIF 95% CI Increased SE Tolerance Tolerance 95% CI
## t2_mean 2.36 [2.26, 2.48] 1.54 0.42 [0.40, 0.44]
## vol_moist_mean 1.24 [1.20, 1.29] 1.12 0.80 [0.77, 0.83]
## imp_mean 1.06 [1.04, 1.11] 1.03 0.94 [0.90, 0.96]
## minutes_since_8am 1.38 [1.33, 1.43] 1.17 0.73 [0.70, 0.75]
## relative_day 2.14 [2.05, 2.24] 1.46 0.47 [0.45, 0.49]
## clouds_numeric 1.38 [1.33, 1.44] 1.18 0.72 [0.70, 0.75]
## wind_numeric 1.58 [1.52, 1.65] 1.26 0.63 [0.61, 0.66]
## shade_numeric 1.44 [1.39, 1.50] 1.20 0.70 [0.67, 0.72]
# check models assumptions and residuals with performance package
check_model(tmp_model1_zip, verbose = T)
## `check_outliers()` does not yet support models of class `glmmTMB`.
# Compute fitted values and Pearson residuals
tmp_fitted_vals <- fitted(tmp_model1_zip)
tmp_residuals <- residuals(tmp_model1_zip, type = "pearson")
# Create binned residuals plot
arm::binnedplot(tmp_fitted_vals, tmp_residuals)
# simulate residuals and check model assumptions with DHARMa package
tmp_sim_model1 <- simulateResiduals(fittedModel = tmp_model1_zip)
plot(tmp_sim_model1)
## DHARMa:testOutliers with type = binomial may have inflated Type I error rates for integer-valued distributions. To get a more exact result, it is recommended to re-run testOutliers with type = 'bootstrap'. See ?testOutliers for details
The ZIP-GLMM shows no pattern in the residuals plot from the
DHARMa package and the binned plot from the
arm package shows no clear trend. The plots and tests from
the performance package show no sign of overdispersion or
high collinearity.
compare_performance(tmp_model1_zib, tmp_model1_zip)
## When comparing models, please note that probably not all models were fit
## from same data.
## # Comparison of Model Performance Indices
##
## Name | Model | AIC (weights) | AICc (weights) | BIC (weights)
## ---------------------------------------------------------------------------
## tmp_model1_zib | glmmTMB | 9904.4 (<.001) | 9904.5 (<.001) | 9982.4 (<.001)
## tmp_model1_zip | glmmTMB | 6526.9 (>.999) | 6527.0 (>.999) | 6603.6 (>.999)
##
## Name | R2 (cond.) | R2 (marg.) | ICC | RMSE | Sigma | Log_loss
## ----------------------------------------------------------------------------
## tmp_model1_zib | 0.997 | 0.234 | 0.996 | 0.089 | 1.000 |
## tmp_model1_zip | 0.565 | 0.133 | 0.498 | 14.683 | 1.000 |
##
## Name | Score_log | Score_spherical
## --------------------------------------------
## tmp_model1_zib | |
## tmp_model1_zip | -0.705 | 0.015
Comparing the model fit of both models shows that the ZIP-GLMM performs much better with a significantly lower AIC, AICc, and BIC. the ZIB-GLMM explains more of the overall variance (both higher conditional R2 and marginal R2), but the ZIB model has an extremely high ICC (0.996), suggesting nearly all of the variance is explained by the random effects. The ZIP model has a lower explanatory power but much lower ICC (0.498), meaning the fixed effects contribute more to the explanation of overall variance. The ZIP model has higher model error (RMSE = 14.683), meaning the predictions are less precise.
Overall, the zero-inflated Poisson GLMM performs better than the zero-inflated binomial GLMM. The rest of the analysis will be conducted with the ZIP-GLMM.
Categorical variables
Now that I have fit the base model, I can test whether the model performs better with the categorical variables as factors or numerically encoded. I fit a model with the categorical variables as factors instead of numerically encoded variables and compare the performance with the base model.
# fit ZIP with categorical predictors as factors
tmp_model1_cat <- glmmTMB(individuals ~ t2_mean + vol_moist_mean + imp_mean +
minutes_since_8am + relative_day +
clouds + wind + shade + # categorical predictors as factors
(1 | site) + (1 | sp_abbr), # random effects
offset = log(species_max),
ziformula = ~1, # zero-inflation
family = poisson, # poisson distribution
data = na.omit(env_analysis))
## Warning in (function (start, objective, gradient = NULL, hessian = NULL, :
## NA/NaN function evaluation
# show summary table to ensure the model has converged (no warnings)
Anova(tmp_model1_cat, type = "II")
## Warning in printHypothesis(L, rhs, names(b)): one or more coefficients in the hypothesis include
## arithmetic operators in their names;
## the printed representation of the hypothesis will be omitted
## Analysis of Deviance Table (Type II Wald chisquare tests)
##
## Response: individuals
## Chisq Df Pr(>Chisq)
## t2_mean 1.1605 1 0.281354
## vol_moist_mean 6.8311 1 0.008958 **
## imp_mean 76.4498 1 < 2.2e-16 ***
## minutes_since_8am 308.2892 1 < 2.2e-16 ***
## relative_day 158.7223 1 < 2.2e-16 ***
## clouds 394.1585 2 < 2.2e-16 ***
## wind 521.0222 2 < 2.2e-16 ***
## shade 401.0190 2 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(tmp_model1_cat)
## Family: poisson ( log )
## Formula:
## individuals ~ t2_mean + vol_moist_mean + imp_mean + minutes_since_8am +
## relative_day + clouds + wind + shade + (1 | site) + (1 | sp_abbr)
## Zero inflation: ~1
## Data: na.omit(env_analysis)
## Offset: log(species_max)
##
## AIC BIC logLik deviance df.resid
## 5553.5 5649.4 -2761.7 5523.5 4413
##
## Random effects:
##
## Conditional model:
## Groups Name Variance Std.Dev.
## site (Intercept) 7.653 2.766
## sp_abbr (Intercept) 2.086 1.444
## Number of obs: 4428, groups: site, 33; sp_abbr, 27
##
## Conditional model:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.2120971 0.6632140 1.828 0.06761 .
## t2_mean 0.0089264 0.0082861 1.077 0.28135
## vol_moist_mean 0.0098764 0.0037788 2.614 0.00896 **
## imp_mean -0.0421326 0.0048187 -8.744 < 2e-16 ***
## minutes_since_8am 0.0034950 0.0001991 17.558 < 2e-16 ***
## relative_day -0.0219700 0.0017439 -12.599 < 2e-16 ***
## cloudslightly cloudy -1.9000674 0.0969210 -19.604 < 2e-16 ***
## cloudsvery cloudy -1.9791487 0.1072728 -18.450 < 2e-16 ***
## windweak wind -1.0008130 0.0841000 -11.900 < 2e-16 ***
## windstrong wind 0.3209682 0.1034862 3.102 0.00193 **
## shadehalf-shade -1.2866773 0.0658158 -19.550 < 2e-16 ***
## shadefull shade -1.2112848 0.0829355 -14.605 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Zero-inflation model:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.35310 0.08306 28.33 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Check for overdispersion with performance package
check_overdispersion(tmp_model1_cat)
## # Overdispersion test
##
## dispersion ratio = 0.000
## p-value = 0.016
## Underdispersion detected.
# check models assumptions and residuals with performance package
check_model(tmp_model1_cat, verbose = T)
## `check_outliers()` does not yet support models of class `glmmTMB`.
# Compute fitted values and Pearson residuals
tmp_fitted_vals <- fitted(tmp_model1_cat)
tmp_residuals <- residuals(tmp_model1_cat, type = "pearson")
# Create binned residuals plot
arm::binnedplot(tmp_fitted_vals, tmp_residuals)
# simulate residuals and check model assumptions with DHARMa package
tmp_sim_model1 <- simulateResiduals(fittedModel = tmp_model1_cat)
plot(tmp_sim_model1)
# compare performance of base model and model with categorical var as factors
compare_performance(tmp_model1_zip, tmp_model1_cat) # base model does better
## # Comparison of Model Performance Indices
##
## Name | Model | AIC (weights) | AICc (weights) | BIC (weights)
## ---------------------------------------------------------------------------
## tmp_model1_zip | glmmTMB | 6526.9 (<.001) | 6527.0 (<.001) | 6603.6 (<.001)
## tmp_model1_cat | glmmTMB | 5553.5 (>.999) | 5553.6 (>.999) | 5649.4 (>.999)
##
## Name | R2 (cond.) | R2 (marg.) | ICC | RMSE | Sigma | Score_log | Score_spherical
## -----------------------------------------------------------------------------------------------
## tmp_model1_zip | 0.565 | 0.133 | 0.498 | 14.683 | 1.000 | -0.705 | 0.015
## tmp_model1_cat | 0.521 | 0.135 | 0.446 | 97.291 | 1.000 | -0.591 | 0.015
The ZIP model with the categorical variables as factors shows no
pattern in the residuals plot from the DHARMa package and
the Q-Q plot is well aligned with the 1:1 line. The binned plot from the
arm package shows no clear trend. The plots and tests from
the performance package show no sign high collinearity, but
a slightly significant underdispersion (p = 0.016). Since the Q-Q plot
and residuals plots look fine and the overdispersion test is not highly
significant, this should not be a large issue.
Comparing model performance show that the categorical model performs much better in terms of AIC, AICc and BIC than the base model. I continue the analysis with the categorical model.
Interactions
It is possible that some interactions are at play, so I try different model configurations with interactions.
# model interaction between temperature and impervious surface
tmp_model1_int_imp <- glmmTMB(individuals ~ t2_mean * imp_mean + vol_moist_mean +
minutes_since_8am + relative_day +
clouds + wind + shade +
(1 | site) + (1 | sp_abbr),
offset = log(species_max),
ziformula = ~1, # zero-inflation
family = poisson, # poisson distribution
data = na.omit(env_analysis))
## Warning in (function (start, objective, gradient = NULL, hessian = NULL, :
## NA/NaN function evaluation
check_collinearity(tmp_model1_int_imp)
## Model has interaction terms. VIFs might be inflated.
## Try to center the variables used for the interaction, or check
## multicollinearity among predictors of a model without interaction terms.
## # Check for Multicollinearity
##
## * conditional component:
##
## Low Correlation
##
## Term VIF VIF 95% CI Increased SE Tolerance Tolerance 95% CI
## imp_mean 3.68 [ 3.50, 3.87] 1.92 0.27 [0.26, 0.29]
## vol_moist_mean 1.45 [ 1.40, 1.51] 1.20 0.69 [0.66, 0.72]
## minutes_since_8am 3.99 [ 3.79, 4.20] 2.00 0.25 [0.24, 0.26]
## relative_day 2.76 [ 2.63, 2.90] 1.66 0.36 [0.35, 0.38]
## clouds 2.95 [ 2.81, 3.10] 1.72 0.34 [0.32, 0.36]
## wind 2.86 [ 2.73, 3.01] 1.69 0.35 [0.33, 0.37]
## shade 2.15 [ 2.06, 2.25] 1.47 0.47 [0.44, 0.49]
##
## Moderate Correlation
##
## Term VIF VIF 95% CI Increased SE Tolerance Tolerance 95% CI
## t2_mean 8.06 [ 7.63, 8.51] 2.84 0.12 [0.12, 0.13]
##
## High Correlation
##
## Term VIF VIF 95% CI Increased SE Tolerance Tolerance 95% CI
## t2_mean:imp_mean 11.21 [10.60, 11.86] 3.35 0.09 [0.08, 0.09]
# model interaction between temperature and date
tmp_model1_int_date <- glmmTMB(individuals ~ t2_mean * relative_day + vol_moist_mean +
minutes_since_8am + imp_mean +
clouds + wind + shade +
(1 | site) + (1 | sp_abbr),
offset = log(species_max),
ziformula = ~1, # zero-inflation
family = poisson, # poisson distribution
data = na.omit(env_analysis))
## Warning in (function (start, objective, gradient = NULL, hessian = NULL, :
## NA/NaN function evaluation
check_collinearity(tmp_model1_int_date)
## Model has interaction terms. VIFs might be inflated.
## Try to center the variables used for the interaction, or check
## multicollinearity among predictors of a model without interaction terms.
## # Check for Multicollinearity
##
## * conditional component:
##
## Low Correlation
##
## Term VIF VIF 95% CI Increased SE Tolerance
## vol_moist_mean 2.18 [ 2.09, 2.28] 1.48 0.46
## minutes_since_8am 2.07 [ 1.98, 2.16] 1.44 0.48
## imp_mean 1.19 [ 1.16, 1.24] 1.09 0.84
## clouds 4.70 [ 4.46, 4.96] 2.17 0.21
## wind 3.79 [ 3.60, 3.99] 1.95 0.26
## shade 2.28 [ 2.18, 2.38] 1.51 0.44
## Tolerance 95% CI
## [0.44, 0.48]
## [0.46, 0.50]
## [0.81, 0.87]
## [0.20, 0.22]
## [0.25, 0.28]
## [0.42, 0.46]
##
## High Correlation
##
## Term VIF VIF 95% CI Increased SE Tolerance
## t2_mean 51.98 [ 49.05, 55.10] 7.21 0.02
## relative_day 183.19 [172.78, 194.23] 13.53 5.46e-03
## t2_mean:relative_day 349.97 [330.05, 371.09] 18.71 2.86e-03
## Tolerance 95% CI
## [0.02, 0.02]
## [0.01, 0.01]
## [0.00, 0.00]
# model interaction between temperature and time of day
tmp_model1_int_time <- glmmTMB(individuals ~ t2_mean * minutes_since_8am + vol_moist_mean +
relative_day + imp_mean +
clouds + wind + shade +
(1 | site) + (1 | sp_abbr),
offset = log(species_max),
ziformula = ~1, # zero-inflation
family = poisson, # poisson distribution
data = na.omit(env_analysis))
## Warning in (function (start, objective, gradient = NULL, hessian = NULL, :
## NA/NaN function evaluation
## Warning in (function (start, objective, gradient = NULL, hessian = NULL, :
## NA/NaN function evaluation
## Warning in (function (start, objective, gradient = NULL, hessian = NULL, :
## NA/NaN function evaluation
check_collinearity(tmp_model1_int_time)
## Model has interaction terms. VIFs might be inflated.
## Try to center the variables used for the interaction, or check
## multicollinearity among predictors of a model without interaction terms.
## # Check for Multicollinearity
##
## * conditional component:
##
## Low Correlation
##
## Term VIF VIF 95% CI Increased SE Tolerance Tolerance 95% CI
## vol_moist_mean 1.52 [ 1.47, 1.59] 1.23 0.66 [0.63, 0.68]
## relative_day 2.82 [ 2.69, 2.96] 1.68 0.35 [0.34, 0.37]
## imp_mean 1.13 [ 1.10, 1.17] 1.06 0.88 [0.85, 0.91]
## clouds 3.62 [ 3.44, 3.80] 1.90 0.28 [0.26, 0.29]
## wind 2.90 [ 2.76, 3.05] 1.70 0.34 [0.33, 0.36]
## shade 2.18 [ 2.08, 2.28] 1.48 0.46 [0.44, 0.48]
##
## High Correlation
##
## Term VIF VIF 95% CI Increased SE Tolerance
## t2_mean 16.10 [ 15.21, 17.05] 4.01 0.06
## minutes_since_8am 83.75 [ 79.00, 88.78] 9.15 0.01
## t2_mean:minutes_since_8am 106.39 [100.36, 112.80] 10.31 9.40e-03
## Tolerance 95% CI
## [0.06, 0.07]
## [0.01, 0.01]
## [0.01, 0.01]
# model all 3 interactions
tmp_model1_int_all <- glmmTMB(individuals ~ t2_mean * minutes_since_8am + t2_mean * relative_day +
t2_mean * imp_mean + vol_moist_mean +
clouds + wind + shade +
(1 | site) + (1 | sp_abbr),
offset = log(species_max),
ziformula = ~1, # zero-inflation
family = poisson, # poisson distribution
data = na.omit(env_analysis))
## Warning in (function (start, objective, gradient = NULL, hessian = NULL, :
## NA/NaN function evaluation
## Warning in (function (start, objective, gradient = NULL, hessian = NULL, :
## NA/NaN function evaluation
## Warning in (function (start, objective, gradient = NULL, hessian = NULL, :
## NA/NaN function evaluation
check_collinearity(tmp_model1_int_all)
## Model has interaction terms. VIFs might be inflated.
## Try to center the variables used for the interaction, or check
## multicollinearity among predictors of a model without interaction terms.
## # Check for Multicollinearity
##
## * conditional component:
##
## Low Correlation
##
## Term VIF VIF 95% CI Increased SE Tolerance Tolerance 95% CI
## imp_mean 3.75 [ 3.56, 3.94] 1.94 0.27 [0.25, 0.28]
## vol_moist_mean 1.67 [ 1.60, 1.74] 1.29 0.60 [0.57, 0.62]
## clouds 4.46 [ 4.24, 4.70] 2.11 0.22 [0.21, 0.24]
## wind 3.19 [ 3.04, 3.35] 1.79 0.31 [0.30, 0.33]
## shade 2.33 [ 2.23, 2.44] 1.53 0.43 [0.41, 0.45]
##
## High Correlation
##
## Term VIF VIF 95% CI Increased SE Tolerance
## t2_mean 38.88 [ 36.70, 41.20] 6.24 0.03
## minutes_since_8am 98.99 [ 93.38, 104.94] 9.95 0.01
## relative_day 86.07 [ 81.20, 91.25] 9.28 0.01
## t2_mean:minutes_since_8am 119.53 [112.75, 126.72] 10.93 8.37e-03
## t2_mean:relative_day 170.73 [161.03, 181.01] 13.07 5.86e-03
## t2_mean:imp_mean 10.78 [ 10.19, 11.40] 3.28 0.09
## Tolerance 95% CI
## [0.02, 0.03]
## [0.01, 0.01]
## [0.01, 0.01]
## [0.01, 0.01]
## [0.01, 0.01]
## [0.09, 0.10]
# model both imperivous surface and date interactions
tmp_model1_int_impday <- glmmTMB(individuals ~ t2_mean * relative_day +
t2_mean * imp_mean + vol_moist_mean +
clouds + wind + shade +
(1 | site) + (1 | sp_abbr),
offset = log(species_max),
ziformula = ~1, # zero-inflation
family = poisson, # poisson distribution
data = na.omit(env_analysis))
## Warning in (function (start, objective, gradient = NULL, hessian = NULL, :
## NA/NaN function evaluation
check_collinearity(tmp_model1_int_impday)
## Model has interaction terms. VIFs might be inflated.
## Try to center the variables used for the interaction, or check
## multicollinearity among predictors of a model without interaction terms.
## # Check for Multicollinearity
##
## * conditional component:
##
## Low Correlation
##
## Term VIF VIF 95% CI Increased SE Tolerance Tolerance 95% CI
## imp_mean 2.62 [ 2.50, 2.75] 1.62 0.38 [0.36, 0.40]
## vol_moist_mean 1.79 [ 1.72, 1.87] 1.34 0.56 [0.53, 0.58]
## clouds 3.52 [ 3.35, 3.71] 1.88 0.28 [0.27, 0.30]
## wind 3.54 [ 3.37, 3.73] 1.88 0.28 [0.27, 0.30]
## shade 2.11 [ 2.02, 2.21] 1.45 0.47 [0.45, 0.49]
##
## Moderate Correlation
##
## Term VIF VIF 95% CI Increased SE Tolerance Tolerance 95% CI
## t2_mean:imp_mean 6.05 [ 5.73, 6.38] 2.46 0.17 [0.16, 0.17]
##
## High Correlation
##
## Term VIF VIF 95% CI Increased SE Tolerance
## t2_mean 27.34 [ 25.81, 28.96] 5.23 0.04
## relative_day 124.18 [117.13, 131.66] 11.14 8.05e-03
## t2_mean:relative_day 214.41 [202.22, 227.34] 14.64 4.66e-03
## Tolerance 95% CI
## [0.03, 0.04]
## [0.01, 0.01]
## [0.00, 0.00]
# compare performance of base model and interactions
compare_performance(tmp_model1_cat, tmp_model1_int_imp, tmp_model1_int_date, tmp_model1_int_time, tmp_model1_int_all, tmp_model1_int_impday)
## # Comparison of Model Performance Indices
##
## Name | Model | AIC (weights) | AICc (weights)
## -----------------------------------------------------------------
## tmp_model1_cat | glmmTMB | 5553.5 (<.001) | 5553.6 (<.001)
## tmp_model1_int_imp | glmmTMB | 5512.3 (<.001) | 5512.4 (<.001)
## tmp_model1_int_date | glmmTMB | 5368.2 (<.001) | 5368.3 (<.001)
## tmp_model1_int_time | glmmTMB | 5555.5 (<.001) | 5555.6 (<.001)
## tmp_model1_int_all | glmmTMB | 5352.3 (>.999) | 5352.5 (>.999)
## tmp_model1_int_impday | glmmTMB | 5470.6 (<.001) | 5470.7 (<.001)
##
## Name | BIC (weights) | R2 (cond.) | R2 (marg.) | ICC
## ------------------------------------------------------------------------
## tmp_model1_cat | 5649.4 (<.001) | 0.521 | 0.135 | 0.446
## tmp_model1_int_imp | 5614.6 (<.001) | 0.530 | 0.133 | 0.458
## tmp_model1_int_date | 5470.5 (0.176) | 0.664 | 0.186 | 0.587
## tmp_model1_int_time | 5657.8 (<.001) | 0.521 | 0.135 | 0.446
## tmp_model1_int_all | 5467.4 (0.824) | 0.639 | 0.197 | 0.550
## tmp_model1_int_impday | 5572.9 (<.001) | 0.606 | 0.155 | 0.534
##
## Name | RMSE | Sigma | Score_log | Score_spherical
## --------------------------------------------------------------------
## tmp_model1_cat | 97.291 | 1.000 | -0.591 | 0.015
## tmp_model1_int_imp | 37.105 | 1.000 | -0.587 | 0.015
## tmp_model1_int_date | 16.454 | 1.000 | -0.571 | 0.015
## tmp_model1_int_time | 96.156 | 1.000 | -0.591 | 0.015
## tmp_model1_int_all | 16.016 | 1.000 | -0.570 | 0.015
## tmp_model1_int_impday | 15.130 | 1.000 | -0.584 | 0.015
Comparing model performance shows that the model including all 3 interaction terms performs significantly better than all other models. However, including interaction terms highly increases multicollinearity and all models show variables with VIFs > 10, which is problematic.
I first try to center all variables included in interaction terms around the variable means and compare the model fits again.
# centering the variables around the mean (centered var = var - mean(var))
env_analysis <- env_analysis %>%
mutate(
cent_t2 = scale(t2_mean, center = TRUE, scale = FALSE),
cent_imp = scale(imp_mean, center = TRUE, scale = FALSE),
cent_minutes_since_8am = scale(minutes_since_8am, center = TRUE, scale = FALSE),
cent_relative_day = scale(relative_day, center = TRUE, scale = FALSE))
# model interaction between temperature and impervious surface
tmp_model1_int_imp_c <- glmmTMB(individuals ~ cent_t2 * cent_imp + vol_moist_mean +
minutes_since_8am + relative_day +
clouds + wind + shade +
(1 | site) + (1 | sp_abbr),
offset = log(species_max),
ziformula = ~1, # zero-inflation
family = poisson, # poisson distribution
data = na.omit(env_analysis))
## Warning in (function (start, objective, gradient = NULL, hessian = NULL, :
## NA/NaN function evaluation
check_collinearity(tmp_model1_int_imp_c)
## # Check for Multicollinearity
##
## * conditional component:
##
## Low Correlation
##
## Term VIF VIF 95% CI Increased SE Tolerance Tolerance 95% CI
## cent_t2 2.43 [2.32, 2.55] 1.56 0.41 [0.39, 0.43]
## cent_imp 1.12 [1.09, 1.17] 1.06 0.89 [0.86, 0.92]
## vol_moist_mean 1.45 [1.40, 1.51] 1.21 0.69 [0.66, 0.71]
## minutes_since_8am 3.94 [3.75, 4.15] 1.99 0.25 [0.24, 0.27]
## relative_day 2.76 [2.63, 2.89] 1.66 0.36 [0.35, 0.38]
## clouds 2.94 [2.81, 3.09] 1.72 0.34 [0.32, 0.36]
## wind 2.86 [2.73, 3.01] 1.69 0.35 [0.33, 0.37]
## shade 2.15 [2.06, 2.25] 1.47 0.47 [0.44, 0.49]
## cent_t2:cent_imp 3.09 [2.94, 3.25] 1.76 0.32 [0.31, 0.34]
# model interaction between temperature and date
tmp_model1_int_date_c <- glmmTMB(individuals ~ cent_t2 * cent_relative_day + vol_moist_mean +
minutes_since_8am + imp_mean +
clouds + wind + shade +
(1 | site) + (1 | sp_abbr),
offset = log(species_max),
ziformula = ~1, # zero-inflation
family = poisson, # poisson distribution
data = na.omit(env_analysis))
## Warning in (function (start, objective, gradient = NULL, hessian = NULL, :
## NA/NaN function evaluation
check_collinearity(tmp_model1_int_date_c)
## # Check for Multicollinearity
##
## * conditional component:
##
## Low Correlation
##
## Term VIF VIF 95% CI Increased SE Tolerance
## cent_t2 3.65 [3.47, 3.84] 1.91 0.27
## cent_relative_day 3.33 [3.17, 3.50] 1.83 0.30
## vol_moist_mean 1.65 [1.59, 1.72] 1.28 0.61
## minutes_since_8am 2.05 [1.96, 2.14] 1.43 0.49
## imp_mean 1.16 [1.13, 1.20] 1.08 0.86
## clouds 3.29 [3.14, 3.46] 1.82 0.30
## wind 3.06 [2.92, 3.22] 1.75 0.33
## shade 2.26 [2.16, 2.37] 1.50 0.44
## cent_t2:cent_relative_day 3.82 [3.63, 4.02] 1.95 0.26
## Tolerance 95% CI
## [0.26, 0.29]
## [0.29, 0.32]
## [0.58, 0.63]
## [0.47, 0.51]
## [0.83, 0.89]
## [0.29, 0.32]
## [0.31, 0.34]
## [0.42, 0.46]
## [0.25, 0.28]
# model interaction between temperature and time of day
tmp_model1_int_time_c <- glmmTMB(individuals ~ cent_t2 * cent_minutes_since_8am + vol_moist_mean +
relative_day + imp_mean +
clouds + wind + shade +
(1 | site) + (1 | sp_abbr),
offset = log(species_max),
ziformula = ~1, # zero-inflation
family = poisson, # poisson distribution
data = na.omit(env_analysis))
## Warning in (function (start, objective, gradient = NULL, hessian = NULL, :
## NA/NaN function evaluation
## Warning in (function (start, objective, gradient = NULL, hessian = NULL, :
## NA/NaN function evaluation
check_collinearity(tmp_model1_int_time_c)
## # Check for Multicollinearity
##
## * conditional component:
##
## Low Correlation
##
## Term VIF VIF 95% CI Increased SE Tolerance
## cent_t2 4.21 [4.00, 4.44] 2.05 0.24
## cent_minutes_since_8am 2.22 [2.13, 2.33] 1.49 0.45
## vol_moist_mean 1.51 [1.45, 1.57] 1.23 0.66
## relative_day 2.85 [2.72, 3.00] 1.69 0.35
## imp_mean 1.13 [1.10, 1.18] 1.06 0.88
## clouds 3.70 [3.52, 3.89] 1.92 0.27
## wind 2.90 [2.76, 3.05] 1.70 0.34
## shade 2.16 [2.07, 2.26] 1.47 0.46
## cent_t2:cent_minutes_since_8am 3.78 [3.60, 3.98] 1.95 0.26
## Tolerance 95% CI
## [0.23, 0.25]
## [0.43, 0.47]
## [0.64, 0.69]
## [0.33, 0.37]
## [0.85, 0.91]
## [0.26, 0.28]
## [0.33, 0.36]
## [0.44, 0.48]
## [0.25, 0.28]
# model all 3 interactions
tmp_model1_int_all_c <- glmmTMB(individuals ~ cent_t2 * cent_minutes_since_8am + cent_t2 * cent_relative_day +
cent_t2 * cent_imp + vol_moist_mean +
clouds + wind + shade +
(1 | site) + (1 | sp_abbr),
offset = log(species_max),
ziformula = ~1, # zero-inflation
family = poisson, # poisson distribution
data = na.omit(env_analysis))
## Warning in (function (start, objective, gradient = NULL, hessian = NULL, :
## NA/NaN function evaluation
check_collinearity(tmp_model1_int_all_c)
## # Check for Multicollinearity
##
## * conditional component:
##
## Low Correlation
##
## Term VIF VIF 95% CI Increased SE Tolerance
## cent_minutes_since_8am 4.09 [3.89, 4.31] 2.02 0.24
## cent_relative_day 3.75 [3.57, 3.95] 1.94 0.27
## cent_imp 1.16 [1.13, 1.21] 1.08 0.86
## vol_moist_mean 1.66 [1.60, 1.74] 1.29 0.60
## clouds 4.54 [4.31, 4.78] 2.13 0.22
## wind 3.19 [3.04, 3.36] 1.79 0.31
## shade 2.33 [2.22, 2.44] 1.53 0.43
## cent_t2:cent_minutes_since_8am 4.43 [4.21, 4.67] 2.10 0.23
## cent_t2:cent_relative_day 4.02 [3.82, 4.23] 2.00 0.25
## cent_t2:cent_imp 3.23 [3.07, 3.39] 1.80 0.31
## Tolerance 95% CI
## [0.23, 0.26]
## [0.25, 0.28]
## [0.83, 0.89]
## [0.58, 0.63]
## [0.21, 0.23]
## [0.30, 0.33]
## [0.41, 0.45]
## [0.21, 0.24]
## [0.24, 0.26]
## [0.29, 0.33]
##
## Moderate Correlation
##
## Term VIF VIF 95% CI Increased SE Tolerance Tolerance 95% CI
## cent_t2 5.43 [5.15, 5.73] 2.33 0.18 [0.17, 0.19]
# model both significant interactions
tmp_model1_int_impday_c <- glmmTMB(individuals ~ cent_t2 * cent_relative_day +
cent_t2 * cent_imp + vol_moist_mean +
clouds + wind + shade +
(1 | site) + (1 | sp_abbr),
offset = log(species_max),
ziformula = ~1, # zero-inflation
family = poisson, # poisson distribution
data = na.omit(env_analysis))
## Warning in (function (start, objective, gradient = NULL, hessian = NULL, :
## NA/NaN function evaluation
check_collinearity(tmp_model1_int_impday_c)
## # Check for Multicollinearity
##
## * conditional component:
##
## Low Correlation
##
## Term VIF VIF 95% CI Increased SE Tolerance
## cent_t2 3.49 [3.33, 3.68] 1.87 0.29
## cent_relative_day 3.16 [3.00, 3.32] 1.78 0.32
## cent_imp 1.16 [1.12, 1.20] 1.08 0.86
## vol_moist_mean 1.52 [1.46, 1.58] 1.23 0.66
## clouds 2.31 [2.21, 2.42] 1.52 0.43
## wind 2.78 [2.65, 2.92] 1.67 0.36
## shade 2.11 [2.02, 2.21] 1.45 0.47
## cent_t2:cent_relative_day 3.66 [3.48, 3.85] 1.91 0.27
## cent_t2:cent_imp 1.58 [1.52, 1.65] 1.26 0.63
## Tolerance 95% CI
## [0.27, 0.30]
## [0.30, 0.33]
## [0.83, 0.89]
## [0.63, 0.68]
## [0.41, 0.45]
## [0.34, 0.38]
## [0.45, 0.49]
## [0.26, 0.29]
## [0.61, 0.66]
# compare performance of base model and interactions
compare_performance(tmp_model1_cat, tmp_model1_int_imp_c, tmp_model1_int_date_c, tmp_model1_int_time_c, tmp_model1_int_all_c, tmp_model1_int_impday_c)
## # Comparison of Model Performance Indices
##
## Name | Model | AIC (weights) | AICc (weights)
## -------------------------------------------------------------------
## tmp_model1_cat | glmmTMB | 5553.5 (<.001) | 5553.6 (<.001)
## tmp_model1_int_imp_c | glmmTMB | 5512.3 (<.001) | 5512.4 (<.001)
## tmp_model1_int_date_c | glmmTMB | 5368.2 (<.001) | 5368.3 (<.001)
## tmp_model1_int_time_c | glmmTMB | 5555.5 (<.001) | 5555.6 (<.001)
## tmp_model1_int_all_c | glmmTMB | 5352.3 (>.999) | 5352.5 (>.999)
## tmp_model1_int_impday_c | glmmTMB | 5470.6 (<.001) | 5470.7 (<.001)
##
## Name | BIC (weights) | R2 (cond.) | R2 (marg.) | ICC
## --------------------------------------------------------------------------
## tmp_model1_cat | 5649.4 (<.001) | 0.521 | 0.135 | 0.446
## tmp_model1_int_imp_c | 5614.6 (<.001) | 0.530 | 0.133 | 0.458
## tmp_model1_int_date_c | 5470.5 (0.176) | 0.664 | 0.186 | 0.587
## tmp_model1_int_time_c | 5657.8 (<.001) | 0.521 | 0.135 | 0.446
## tmp_model1_int_all_c | 5467.4 (0.824) | 0.639 | 0.197 | 0.550
## tmp_model1_int_impday_c | 5572.9 (<.001) | 0.606 | 0.155 | 0.534
##
## Name | RMSE | Sigma | Score_log | Score_spherical
## ----------------------------------------------------------------------
## tmp_model1_cat | 97.291 | 1.000 | -0.591 | 0.015
## tmp_model1_int_imp_c | 37.105 | 1.000 | -0.587 | 0.015
## tmp_model1_int_date_c | 16.454 | 1.000 | -0.571 | 0.015
## tmp_model1_int_time_c | 96.156 | 1.000 | -0.591 | 0.015
## tmp_model1_int_all_c | 16.016 | 1.000 | -0.570 | 0.015
## tmp_model1_int_impday_c | 15.130 | 1.000 | -0.584 | 0.015
Centering the predictors involved in interactions has fixed the collinearity issues, and all models show predictors with low to moderate correlation. Comparing model performance shows that the model which includes all interaction terms still performs much better than all other models with a much lower AIC, AICc, and BIC.
I can now check model assumptions for the model which includes all interaction terms.
# check assumptions for model with all interaction terms and centered predictors
# Check for overdispersion with performance package
check_overdispersion(tmp_model1_int_all_c)
## # Overdispersion test
##
## dispersion ratio = 0.000
## p-value = < 0.001
## Underdispersion detected.
check_collinearity(tmp_model1_int_all_c)
## # Check for Multicollinearity
##
## * conditional component:
##
## Low Correlation
##
## Term VIF VIF 95% CI Increased SE Tolerance
## cent_minutes_since_8am 4.09 [3.89, 4.31] 2.02 0.24
## cent_relative_day 3.75 [3.57, 3.95] 1.94 0.27
## cent_imp 1.16 [1.13, 1.21] 1.08 0.86
## vol_moist_mean 1.66 [1.60, 1.74] 1.29 0.60
## clouds 4.54 [4.31, 4.78] 2.13 0.22
## wind 3.19 [3.04, 3.36] 1.79 0.31
## shade 2.33 [2.22, 2.44] 1.53 0.43
## cent_t2:cent_minutes_since_8am 4.43 [4.21, 4.67] 2.10 0.23
## cent_t2:cent_relative_day 4.02 [3.82, 4.23] 2.00 0.25
## cent_t2:cent_imp 3.23 [3.07, 3.39] 1.80 0.31
## Tolerance 95% CI
## [0.23, 0.26]
## [0.25, 0.28]
## [0.83, 0.89]
## [0.58, 0.63]
## [0.21, 0.23]
## [0.30, 0.33]
## [0.41, 0.45]
## [0.21, 0.24]
## [0.24, 0.26]
## [0.29, 0.33]
##
## Moderate Correlation
##
## Term VIF VIF 95% CI Increased SE Tolerance Tolerance 95% CI
## cent_t2 5.43 [5.15, 5.73] 2.33 0.18 [0.17, 0.19]
# check models assumptions and residuals with performance package
check_model(tmp_model1_int_all_c, verbose = T)
## `check_outliers()` does not yet support models of class `glmmTMB`.
# Compute fitted values and Pearson residuals
tmp_fitted_vals <- fitted(tmp_model1_int_all_c)
tmp_residuals <- residuals(tmp_model1_int_all_c, type = "pearson")
# Create binned residuals plot
arm::binnedplot(tmp_fitted_vals, tmp_residuals)
# simulate residuals and check model assumptions with DHARMa package
tmp_sim_model1 <- simulateResiduals(fittedModel = tmp_model1_int_all_c)
plot(tmp_sim_model1)
## DHARMa:testOutliers with type = binomial may have inflated Type I error rates for integer-valued distributions. To get a more exact result, it is recommended to re-run testOutliers with type = 'bootstrap'. See ?testOutliers for details
The model with all interaction terms and centered predictors shows no
pattern in the residuals plot from the DHARMa package and
the Q-Q plot is well aligned with the 1:1 line. The binned plot from the
arm package shows no clear trend. The plots and tests from
the performance package show no sign of high collinearity,
but a significant underdispersion (p < 0.001). Since the Q-Q plot and
residuals plots look normal, this should not be a large issue. As
described in the DHARMa package vignette, “From a technical
side, underdispersion is not as concerning as over dispersion, as it
will usually bias p-values to the conservative side”.
From the different model combinations, it appears that the best performing model is a zero-inflated Poisson GLMM with the number of ants per species as a response offset by the maximum number of ants ever found per species, which models how the different predictors affect the relative species-level ant activity. The model has 3 interaction terms: between ground temperature and impervious surface cover, between ground temperature and relative date, between ground temperature and time of day. All predictors involved in interaction terms are centered around their means.The categorical variables are modeled as factors.
# fit model with all interaction terms and centered predictors
model1 <- glmmTMB(individuals ~ cent_t2 * cent_minutes_since_8am + cent_t2 * cent_relative_day +
cent_t2 * cent_imp + vol_moist_mean +
clouds + wind + shade +
(1 | site) + (1 | sp_abbr),
offset = log(species_max),
ziformula = ~1, # zero-inflation
family = poisson, # poisson distribution
data = na.omit(env_analysis))
## Warning in (function (start, objective, gradient = NULL, hessian = NULL, :
## NA/NaN function evaluation
rm(list = ls(pattern = "^tmp_"))
# show model results
Anova(model1, type = "III")
## Warning in printHypothesis(L, rhs, names(b)): one or more coefficients in the hypothesis include
## arithmetic operators in their names;
## the printed representation of the hypothesis will be omitted
## Analysis of Deviance Table (Type III Wald chisquare tests)
##
## Response: individuals
## Chisq Df Pr(>Chisq)
## (Intercept) 2.8156 1 0.0933519 .
## cent_t2 7.8600 1 0.0050540 **
## cent_minutes_since_8am 107.0816 1 < 2.2e-16 ***
## cent_relative_day 166.4223 1 < 2.2e-16 ***
## cent_imp 105.3808 1 < 2.2e-16 ***
## vol_moist_mean 7.4032 1 0.0065109 **
## clouds 314.2051 2 < 2.2e-16 ***
## wind 388.9281 2 < 2.2e-16 ***
## shade 270.2005 2 < 2.2e-16 ***
## cent_t2:cent_minutes_since_8am 7.4807 1 0.0062363 **
## cent_t2:cent_relative_day 155.3776 1 < 2.2e-16 ***
## cent_t2:cent_imp 12.1228 1 0.0004981 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(model1)
## Family: poisson ( log )
## Formula:
## individuals ~ cent_t2 * cent_minutes_since_8am + cent_t2 * cent_relative_day +
## cent_t2 * cent_imp + vol_moist_mean + clouds + wind + shade +
## (1 | site) + (1 | sp_abbr)
## Zero inflation: ~1
## Data: na.omit(env_analysis)
## Offset: log(species_max)
##
## AIC BIC logLik deviance df.resid
## 5352.3 5467.4 -2658.2 5316.3 4410
##
## Random effects:
##
## Conditional model:
## Groups Name Variance Std.Dev.
## site (Intercept) 8.016 2.831
## sp_abbr (Intercept) 1.016 1.008
## Number of obs: 4428, groups: site, 33; sp_abbr, 27
##
## Conditional model:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.895e-01 5.301e-01 -1.678 0.093352 .
## cent_t2 -3.431e-02 1.224e-02 -2.804 0.005054 **
## cent_minutes_since_8am 2.918e-03 2.820e-04 10.348 < 2e-16 ***
## cent_relative_day -2.693e-02 2.087e-03 -12.900 < 2e-16 ***
## cent_imp -5.001e-02 4.871e-03 -10.266 < 2e-16 ***
## vol_moist_mean -1.140e-02 4.190e-03 -2.721 0.006511 **
## cloudslightly cloudy -2.058e+00 1.164e-01 -17.676 < 2e-16 ***
## cloudsvery cloudy -2.191e+00 1.325e-01 -16.536 < 2e-16 ***
## windweak wind -6.242e-01 9.113e-02 -6.850 7.40e-12 ***
## windstrong wind 6.868e-01 1.095e-01 6.273 3.54e-10 ***
## shadehalf-shade -1.123e+00 6.838e-02 -16.423 < 2e-16 ***
## shadefull shade -9.097e-01 8.737e-02 -10.411 < 2e-16 ***
## cent_t2:cent_minutes_since_8am -1.632e-04 5.968e-05 -2.735 0.006236 **
## cent_t2:cent_relative_day 4.666e-03 3.743e-04 12.465 < 2e-16 ***
## cent_t2:cent_imp 1.056e-03 3.033e-04 3.482 0.000498 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Zero-inflation model:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.42567 0.08326 29.14 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# show model performance
model_performance(model1)
## # Indices of model performance
##
## AIC | AICc | BIC | R2 (cond.) | R2 (marg.) | ICC | RMSE
## -------------------------------------------------------------------------
## 5352.322 | 5352.477 | 5467.445 | 0.639 | 0.197 | 0.550 | 16.016
##
## AIC | Sigma | Score_log | Score_spherical
## ----------------------------------------------
## 5352.322 | 1.000 | -0.570 | 0.015
# Extract fixed effects and compute IRR
tmp_model1_fixed_effects <- tidy(model1, effects = "fixed") %>%
mutate(IRR = exp(estimate)) %>%
select(term, estimate, std.error, p.value, IRR)
tmp_model1_fixed_effects
## # A tibble: 16 × 5
## term estimate std.error p.value IRR
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -0.889 0.530 9.34e- 2 0.411
## 2 cent_t2 -0.0343 0.0122 5.05e- 3 0.966
## 3 cent_minutes_since_8am 0.00292 0.000282 4.27e- 25 1.00
## 4 cent_relative_day -0.0269 0.00209 4.47e- 38 0.973
## 5 cent_imp -0.0500 0.00487 1.01e- 24 0.951
## 6 vol_moist_mean -0.0114 0.00419 6.51e- 3 0.989
## 7 cloudslightly cloudy -2.06 0.116 6.44e- 70 0.128
## 8 cloudsvery cloudy -2.19 0.133 2.02e- 61 0.112
## 9 windweak wind -0.624 0.0911 7.40e- 12 0.536
## 10 windstrong wind 0.687 0.109 3.54e- 10 1.99
## 11 shadehalf-shade -1.12 0.0684 1.31e- 60 0.325
## 12 shadefull shade -0.910 0.0874 2.20e- 25 0.403
## 13 cent_t2:cent_minutes_since_8am -0.000163 0.0000597 6.24e- 3 1.00
## 14 cent_t2:cent_relative_day 0.00467 0.000374 1.16e- 35 1.00
## 15 cent_t2:cent_imp 0.00106 0.000303 4.98e- 4 1.00
## 16 (Intercept) 2.43 0.0833 1.28e-186 11.3
# report model results
report(model1)
## We fitted a zero-inflated poisson mixed model (estimated using ML and nlminb
## optimizer) to predict individuals with cent_t2, cent_minutes_since_8am,
## cent_relative_day, cent_imp, vol_moist_mean, clouds, wind and shade (formula:
## individuals ~ cent_t2 * cent_minutes_since_8am + cent_t2 * cent_relative_day +
## cent_t2 * cent_imp + vol_moist_mean + clouds + wind + shade). The model
## included site as random effects (formula: list(~1 | site, ~1 | sp_abbr)). The
## model's total explanatory power is substantial (conditional R2 = 0.64) and the
## part related to the fixed effects alone (marginal R2) is of 0.20. The model's
## intercept, corresponding to cent_t2 = 0, cent_minutes_since_8am = 0,
## cent_relative_day = 0, cent_imp = 0, vol_moist_mean = 0, clouds = no clouds,
## wind = no wind and shade = no shade, is at -0.89 (95% CI [-1.93, 0.15], p =
## 0.093). Within this model:
##
## - The effect of cent t2 is statistically significant and negative (beta =
## -0.03, 95% CI [-0.06, -0.01], p = 0.005; Std. beta = -0.11, 95% CI [-0.19,
## -0.02])
## - The effect of cent minutes since 8am is statistically significant and
## positive (beta = 2.92e-03, 95% CI [2.37e-03, 3.47e-03], p < .001; Std. beta =
## 0.51, 95% CI [0.45, 0.57])
## - The effect of cent relative day is statistically significant and negative
## (beta = -0.03, 95% CI [-0.03, -0.02], p < .001; Std. beta = -0.72, 95% CI
## [-0.79, -0.65])
## - The effect of cent imp is statistically significant and negative (beta =
## -0.05, 95% CI [-0.06, -0.04], p < .001; Std. beta = -1.45, 95% CI [-1.68,
## -1.22])
## - The effect of vol moist mean is statistically significant and negative (beta
## = -0.01, 95% CI [-0.02, -3.19e-03], p = 0.007; Std. beta = -0.09, 95% CI
## [-0.14, -0.04])
## - The effect of clouds [lightly cloudy] is statistically significant and
## negative (beta = -2.06, 95% CI [-2.29, -1.83], p < .001; Std. beta = -2.20, 95%
## CI [-2.29, -2.11])
## - The effect of clouds [very cloudy] is statistically significant and negative
## (beta = -2.19, 95% CI [-2.45, -1.93], p < .001; Std. beta = -2.28, 95% CI
## [-2.38, -2.18])
## - The effect of wind [weak wind] is statistically significant and negative
## (beta = -0.62, 95% CI [-0.80, -0.45], p < .001; Std. beta = -0.70, 95% CI
## [-0.76, -0.63])
## - The effect of wind [strong wind] is statistically significant and positive
## (beta = 0.69, 95% CI [0.47, 0.90], p < .001; Std. beta = 0.60, )
## - The effect of shade [half-shade] is statistically significant and negative
## (beta = -1.12, 95% CI [-1.26, -0.99], p < .001; Std. beta = -1.15, 95% CI
## [-1.25, -1.04])
## - The effect of shade [full shade] is statistically significant and negative
## (beta = -0.91, 95% CI [-1.08, -0.74], p < .001; Std. beta = -0.89, )
## - The effect of cent t2 × cent minutes since 8am is statistically significant
## and negative (beta = -1.63e-04, 95% CI [-2.80e-04, -4.63e-05], p = 0.006; Std.
## beta = -0.15, 95% CI [-0.22, -0.08])
## - The effect of cent t2 × cent relative day is statistically significant and
## positive (beta = 4.67e-03, 95% CI [3.93e-03, 5.40e-03], p < .001; Std. beta =
## 0.76, 95% CI [0.66, 0.86])
## - The effect of cent t2 × cent imp is statistically significant and positive
## (beta = 1.06e-03, 95% CI [4.62e-04, 1.65e-03], p < .001; Std. beta = 0.15, 95%
## CI [0.10, 0.21])
##
## Standardized parameters were obtained by fitting the model on a standardized
## version of the dataset. 95% Confidence Intervals (CIs) and p-values were
## computed using a Wald z-distribution approximation.
# get average values of all predictors involved in interactions
mean(na.omit(env_analysis$t2_mean)) # mean ground temperature
## [1] 24.24898
mean(na.omit(env_analysis$imp_mean)) # mean impervious surface cover
## [1] 43.29771
mean(na.omit(env_analysis$minutes_since_8am)) # mean exp start time
## [1] 241.8511
mean(na.omit(env_analysis$relative_day)) # mean exp date
## [1] 59.63866
I fitted a zero-inflated Poisson generalized linear mixed model (GLMM) (estimated using ML and nlminb optimizer) to analyze the effects of environmental conditions and urbanization on species-level ant foraging activity. Ant foraging activity was modeled as the species-specific number of ants collected at the end of each experiment, offset by the species’ maximum observed number of individuals across experiments. The environmental conditions and urbanization variables included as predictor variables (fixed effects) were ground temperature, soil moisture, mean impervious surface cover in a 100m radius around the experiment, time of day, date of experiment, cloud cover, wind, and shade. Three interaction terms were included to test whether the effect of temperature varied with impervious surface cover, date of experiment, and time of day. All variables included in the interaction terms were centered around their mean values to reduce multicollinearity (formula: ant activity ~ centered temperature * centered time of day + centered temperature * centered date of experiment + centered temperature * centered impervious surface cover + soil moisture + cloud cover + wind + shade). The model included a random intercept for site and species (formulas: ~ 1|site, ~1|species).
The model’s total explanatory power is substantial (conditional R2 = 0.64) and the part related to the fixed effects alone (marginal R2) is of 0.20. The variance of the site random effect (variance = 8.016) indicates that there is substantial site-level variation in ant activity. The smaller variance of the species random effect (variance = 1.016) suggests moderate species-level differences in ant activity.
The model fit yielded an AIC of 5352.3 and a BIC of 5467.4, with a log-likelihood of -2658.2 and a deviance of 5316.3. The model included 4428 observations, with random effects for 33 sites and 27 species.
The type III Wald chi-square tests from the analysis of deviance table indicate significant effects of all predictor variables and interaction terms (see ANOVA table, p < 0.05).
Since all variables involved in the interactions are centered around their mean values, the main effect of temperature at average impervious surface cover (43.30%), average experiment start time (241.85 minutes after 8:00 AM or 12:02 PM), and average experiment date (59.64 or June 10) is significantly negative (beta = −0.03, SE = 0.01, z = -2.80, P = 0.005). However, the effect of temperature on ant foraging activity varies significantly depending on impervious surface cover, experiment start time, and experiment date. Specifically, the effect of temperature on ant foraging activity becomes significantly less negative in more urbanized areas (beta = 0.001, SE = 0.0003, z = 3.48, p = 0.0005). The effect of temperature also becomes significantly less negative in experiments done at later dates in the season (beta = 0.005 SE = 0.0004, z = 12.47, p < 2e-16). The effect of temperature becomes significantly more negative when the experiment start is later in the day (beta = -0.0002, SE = 0.00006, z = -2.74, p = 0.006).
At an average ground temperature (24.24 °C), impervious surface cover has a significantly negative impact on ant foraging activity (beta = -0.05, SE = 0.005, z = -10.27, p < 2e-16), and so does date of experiment (i.e., at average temperatures, ant foraging activity decreases later in the season) (beta = -0.03, SE = 0.002, z = -12.90, p < 2e-16). However, at average ground temperatures, ant foraging activity significantly increases as experiments are started later in the day (beta = 0.003, SE = 0.0003, z = 10.35, p < 2e-16).
Soil moisture significantly decreases ant foraging activity (beta = -0.01, SE = 0.004, z = -2.72, p = 0.007). In terms of cloud cover and compared to the reference level “no clouds”, lightly cloudy conditions significantly increase ant foraging activity (beta = 1.42, SE = 0.08, z = 17.5, p < 2e-16) while very cloudy conditions decrease ant foraging activity (beta = -0.64, SE = 0.04, z = -15.19, p < 2e-16). Compared to the absence of wind, weak wind does not significantly influence ant foraging activity (beta = -0.02, SE = 0.06, z = -0.33, p = 0.74). However, strong wind has a significant negative impact on and foraging activity (beta = -0.65, SE = 0.04, z = -16.28, p < 2e-16). Compared to fully sunny conditions (no shade), half-shade had a significant positive impact on ant foraging activity (beta = 0.68, SE = 0.05, z = 14.56, p < 2e-16) while full shade had a significant negative impact (beta = -0.45, SE = 0.04, z = -12.20, p < 2e-16).
I can plot the interaction terms to visualize how the effect of temperature on ant activity varies with impervious surface cover, time of day and date of experiment.
# get centered values for 0th, 25th, 50th, 75th, 100th percentiles
tmp_centered_imp <- quantile(env_analysis$cent_imp, probs = c(0, 0.25, 0.50, 0.75, 1), na.rm = T)
# Get predicted values for temperature at specified quantiles of impervious surface
tmp_pred_int <- ggpredict(model1, terms = c("cent_t2", paste0("cent_imp [", paste(tmp_centered_imp, collapse = ", "), "]")))
## You are calculating adjusted predictions on the population-level (i.e.
## `type = "fixed"`) for a *generalized* linear mixed model.
## This may produce biased estimates due to Jensen's inequality. Consider
## setting `bias_correction = TRUE` to correct for this bias.
## See also the documentation of the `bias_correction` argument.
# plot effect of temperature on ant activity at different urbanization levels
(tmp_plot_int_imp <- ggplot(tmp_pred_int, aes(x = x, y = predicted, color = group)) +
geom_line(size = 1) +
geom_ribbon(aes(ymin = conf.low, ymax = conf.high, fill = group), alpha = 0.05, color = NA) + # adds 95% CI
labs(x = "Centered ground temperature (\u00B0C)", y = "Predicted species-level ant foraging activity\n(predicted number of individuals)",
title = "Interaction of ground temperature\n and impervious surface cover") +
scale_color_viridis_d() + # adds viridis palette
scale_fill_viridis_d() + # adds viridis palette
theme_classic2() +
theme(legend.position = "none",
plot.title = element_text(hjust = 0.5, size = 12, margin = margin(b = 15)),
axis.title.x = element_blank(), # remove axis title
axis.title.y = element_text(margin = margin(r = 10), size = 10)) # adjust distance from axis title to axis
)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# get centered values for 0th, 25th, 50th, 75th, 100th percentiles
tmp_centered_start_times <- quantile(env_analysis$cent_minutes_since_8am, probs = c(0, 0.25, 0.50, 0.75, 1), na.rm = T)
# Get predicted values for temperature at specified quantiles of impervious surface
tmp_pred_time <- ggpredict(model1, terms = c("cent_t2", paste0("cent_minutes_since_8am [", paste(tmp_centered_start_times, collapse = ", "), "]")))
# Plot effect of temperature on ant activity at different start times
(tmp_plot_int_time <- ggplot(tmp_pred_time, aes(x = x, y = predicted, color = group)) +
geom_line(size = 1) +
geom_ribbon(aes(ymin = conf.low, ymax = conf.high, fill = group), alpha = 0.1, color = NA) + # adds 95% CI
labs(x = "Centered ground temperature (\u00B0C)", y = "Predicted species-level ant foraging activity",
title = "Interaction of ground temperature\n and experiment start time") +
scale_color_viridis_d() + # adds viridis palette
scale_fill_viridis_d() + # adds viridis palette
theme_classic2() +
theme(legend.position = "none",
plot.title = element_text(hjust = 0.5, size = 12, margin = margin(b = 15)),
axis.title.x = element_text(margin = margin(t = 20), size = 10), # adjust distance from axis title to axis
axis.title.y = element_blank()) # remove axis title
)
# get centered values for 0th, 25th, 50th, 75th, 100th percentiles
tmp_centered_dates <- quantile(env_analysis$cent_relative_day, probs = c(0, 0.25, 0.50, 0.75, 1), na.rm = T)
# Get predicted values for temperature at specified quantiles of impervious surface
tmp_pred_date <- ggpredict(model1, terms = c("cent_t2", paste0("cent_relative_day [", paste(tmp_centered_dates, collapse = ", "), "]")))
# Define custom labels
tmp_date_labels <- c("0th", "25th", "50th", "75th", "100th")
# Plot effect of temperature on ant activity at different experiment dates
(tmp_plot_int_date <- ggplot(tmp_pred_date, aes(x = x, y = predicted, color = group)) +
geom_line(size = 1) +
geom_ribbon(aes(ymin = conf.low, ymax = conf.high, fill = group), alpha = 0.05, color = NA) + # adds 95% CI
labs(title = "Interaction of ground temperature\n and date of experiment",
color = "Percentile", fill = "Percentile") +
scale_color_viridis_d(labels = tmp_date_labels) + # adds custom labels and viridis palette
scale_fill_viridis_d(labels = tmp_date_labels) + # adds custom labels and viridis palette
theme_classic2() +
theme(legend.text = element_text(size = 10), # set legend text
legend.title = element_text(size = 12, hjust = 0, vjust = 2), # Smaller legend title
legend.key.size = unit(0.5, "cm"), # Makes the legend squares smaller
plot.title = element_text(hjust = 0.5, size = 12, margin = margin(b = 15)),
axis.title.x = element_blank(), # remove axis title
axis.title.y = element_blank()) + # remove axis title
guides(fill = guide_legend(ncol = 1)) # Set legend to 1 column
)
# combine all interaction plots into 1 plot
(tmp_interaction_plots <- tmp_plot_int_imp + tmp_plot_int_time + tmp_plot_int_date +
plot_layout(nrow = 1) + plot_annotation(tag_levels = 'a', tag_suffix = ")")) # Arrange in 1 row, adds a), b), c)
# show different percentile values for each predictor
quantile(env_analysis$imp_mean, probs = c(0, 0.25, 0.5, 0.75, 1), na.rm = TRUE)
## 0% 25% 50% 75% 100%
## 0.00000 27.32551 48.20256 62.71733 93.84142
quantile(env_analysis$minutes_since_8am, probs = c(0, 0.25, 0.5, 0.75, 1), na.rm = TRUE)
## 0% 25% 50% 75% 100%
## 10 106 197 318 636
quantile(env_analysis$relative_day, probs = c(0, 0.25, 0.5, 0.75, 1), na.rm = TRUE)
## 0% 25% 50% 75% 100%
## 1 39 50 75 148
This graph shows predicted species-level ant foraging activity as a function of ground temperature across different levels of impervious surface cover (expressed in % of ground covered by impervious surfaces in a 100m radius around the experiment), time at which the experiment started (expressed in minutes since 8:00 AM), and date of experiment (expressed as a relative date, with the experiment done earliest in the year as day 1). The predictions are based on a zero-inflated Poisson GLMM with the number of ant individuals per species as a response and the maximum observed number of individuals per species included as an offset to account for species-level differences in colony size and foraging strategies. Predicted values reflect the expected number of individuals for an average ant species, shown here with the 95% confidence interval. The predicted values at the 0th, 25th, 50th, 75th, and 100th percentiles of impervious surface cover (0th = 0%, 25th = 27.33%, 50th = 48.20%, 75th = 62.72%, 100th = 93.84%), experiment start time (0th = 10 min or 8:10 AM, 25th = 106 min or 9:46 AM, 50th = 197 min or 11:17 AM, 75th = 318 min or 1:18 PM, 100th = 636 min or 6:36 PM) and experiment date (0th = day 1 or April 12, 25th = day 39 or May 20, 50th = day 50 or May 31, 75th = day 75 or June 25, 100th = day 148 or September 6) are shown.
To visualize how each environmental variable affects ant foraging activity and see which variables have the strongest effect, I can plot the effect sizes of the model.
# show model IRRs (exp(estimates))
tmp_model1_fixed_effects <- tidy(model1, effects = "fixed") %>%
mutate(IRR = exp(estimate)) %>%
select(term, estimate, std.error, p.value, IRR)
tmp_model1_fixed_effects
## # A tibble: 16 × 5
## term estimate std.error p.value IRR
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -0.889 0.530 9.34e- 2 0.411
## 2 cent_t2 -0.0343 0.0122 5.05e- 3 0.966
## 3 cent_minutes_since_8am 0.00292 0.000282 4.27e- 25 1.00
## 4 cent_relative_day -0.0269 0.00209 4.47e- 38 0.973
## 5 cent_imp -0.0500 0.00487 1.01e- 24 0.951
## 6 vol_moist_mean -0.0114 0.00419 6.51e- 3 0.989
## 7 cloudslightly cloudy -2.06 0.116 6.44e- 70 0.128
## 8 cloudsvery cloudy -2.19 0.133 2.02e- 61 0.112
## 9 windweak wind -0.624 0.0911 7.40e- 12 0.536
## 10 windstrong wind 0.687 0.109 3.54e- 10 1.99
## 11 shadehalf-shade -1.12 0.0684 1.31e- 60 0.325
## 12 shadefull shade -0.910 0.0874 2.20e- 25 0.403
## 13 cent_t2:cent_minutes_since_8am -0.000163 0.0000597 6.24e- 3 1.00
## 14 cent_t2:cent_relative_day 0.00467 0.000374 1.16e- 35 1.00
## 15 cent_t2:cent_imp 0.00106 0.000303 4.98e- 4 1.00
## 16 (Intercept) 2.43 0.0833 1.28e-186 11.3
# plot fixed effects with IRR directly
(tmp_effect_sizes <- plot_model(model1, type = "est", show.values = TRUE,
value.offset = 0.3, digits = 4, dot.size = 1, value.size = 3) +
geom_hline(yintercept = 1, color = "black", linetype = 3) + # add dashed vertical line at 1
theme_bw() +
ggtitle("Effect sizes of predictors on ant foraging activity") +
theme(plot.title = element_text(hjust = 0.5, size = 12),
axis.text.x = element_text(size = 9),
axis.text.y = element_text(size = 9, hjust = 1)) +
scale_x_discrete(labels = c( # rename the predictors
"cent_t2" = "Ground temperature (centered)",
"cent_imp" = "Impervious surface cover (centered)",
"vol_moist_mean" = "Soil moisture",
"cent_relative_day" = "Date of experiment (centered)",
"cent_minutes_since_8am" = "Experiment start time (centered)",
"clouds1" = "Cloud cover (lightly cloudy)",
"clouds2" = "Cloud cover (very cloudy)",
"wind1" = "Wind (weak wind)",
"wind2" = "Wind (strong wind)",
"shade1" = "Shade (half-shade)",
"shade2" = "Shade (full shade)",
"cent_t2:cent_minutes_since_8am" = "Interaction of temperature and start time (centered)",
"cent_t2:cent_relative_day" = "Interaction of temperature and date (centered)",
"cent_t2:cent_imp" = "Interaction of temperature and impervious surface (centered)"
)) # Custom labels for the predictors
)
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
# plot random effects
plot_model(model1, type = "re", vline.color = "black")
## [[1]]
##
## [[2]]
rm(list = ls(pattern = "^tmp_"))
The overarching research question is: Can ants differentiate between baits of different sugar concentrations, and do they forage more actively on high sugar concentration baits?
More specifically, is there a difference in the average number of ants collected on the baits with different sugar concentrations at the end of the baiting experiments ? Is there an increase in the number of ants present on baits with a higher sugar concentration ?
This research question is treated in two parts: first, I test the effect of sugar concentration on ant foraging activity in terms of presence or absence of ants on the different sugar concentrations. Then, when ants are present on the experiments, I test the effect of sugar concentrations on and foraging activity in terms of species-specific abundance of ants on the different sugar concentrations.
Description of tested variables
Two response variables:
Explanatory variables: the sugar concentration of the sugar-water solutions added to the baits (either discrete quantitative or ordinal categorical)
Random effects:
I start by modelling the effect of sugar concentration on the presence or absence of ants on the baits. The question is: are ants more likely to be present on certain sugar concentrations than others ?
I calculate the proportion of baits where ants were present and absent, then I can visualize ant presence vs sugar concentration.This is done both for the species-level table (sugar_analysis) and the overall table (sugar_analysis_overall).
I need to model overall presence and absence, not species-level presence because it adds a lot of “artificial/structural” zeros from species which were not observed on an experiment (most experiments only had 1-2 species present, up to 5).
# calculate proportion of presence at species-level
table(sugar_analysis$presence)
##
## 0 1
## 27222 453
prop.table(table(sugar_analysis$presence)) # Proportion of 1s and 0s
##
## 0 1
## 0.98363144 0.01636856
# calculate overall proportion of presence
table(sugar_analysis_overall$presence)
##
## 0 1
## 645 380
prop.table(table(sugar_analysis_overall$presence)) # Proportion of 1s and 0s
##
## 0 1
## 0.6292683 0.3707317
# visualize ant presence vs concentration at species-level
ggplot(na.omit(sugar_analysis), aes(x = factor(concentration), fill = factor(presence))) +
geom_bar(position = "fill", color = "black") + # add black outline
scale_fill_manual(values = c("white", "black"),
labels = c("Absence", "Presence")) +
labs(x = "Sugar concentration (%)", y = "Proportion of baits occupied", fill = "") +
theme_bw() +
theme(panel.grid.major = element_blank(), # Remove major grid
panel.grid.minor = element_blank(), # Remove minor grid
axis.title.y = element_text(margin = margin(r = 10), hjust = 0.5), # adjust distance from axis title to axis, center title
axis.title.x = element_text(margin = margin(t = 10)))
# visualize ant presence vs concentration overall
(tmp_raw_presence_plot <- ggplot(na.omit(sugar_analysis_overall), aes(x = factor(concentration), fill = factor(presence))) +
geom_bar(position = "fill", color = "black", size = 0.2) + # add black outline
scale_fill_manual(values = c("white", "black"),
labels = c("Absence", "Presence")) + # Apply Acadia discrete color scale
labs(x = "Sugar concentration (%)", y = "Observed proportion of baits occupied by ants", fill = "") +
theme_bw() +
theme(panel.grid.major = element_blank(), # Remove major grid
panel.grid.minor = element_blank(), # Remove minor grid
legend.position = "right", # set legend position
legend.text = element_text(size = 8), # set legend text
legend.key.size = unit(0.4, "cm"), # Makes the legend squares smaller
axis.title.y = element_text(size = 10, margin = margin(r = 10), hjust = 0.5), # adjust distance from axis title to axis, center title
axis.title.x = element_text(size = 10, margin = margin(t = 10))))
For testing the influence of sugar concentration on ant presence or absence, my response variable is binary (1 = ants are present on a given bait, 0 = no ants were found on the bait). For binary data with random effects (the site variable), I run a logistic GLMM.
The sugar concentration predictor can either be considered as a discrete numerical variable or a ordinal categorical variable. I will fit two models with these two options and check which model performs best.
I am modelling the overall ant presence/absence, NOT the species-level data.
# fit model with numerical concentration
tmp_model3_pres_num <- glmer(presence ~ as.numeric(concentration) + (1 | site),
data = sugar_analysis_overall, family = binomial,
control = glmerControl(optimizer = "bobyqa"))
# show summary table to ensure the model has converged (no warnings)
summary(tmp_model3_pres_num)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: presence ~ as.numeric(concentration) + (1 | site)
## Data: sugar_analysis_overall
## Control: glmerControl(optimizer = "bobyqa")
##
## AIC BIC logLik deviance df.resid
## 1159.7 1174.5 -576.9 1153.7 1007
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.2603 -0.7111 -0.3302 0.8102 4.1420
##
## Random effects:
## Groups Name Variance Std.Dev.
## site (Intercept) 1.894 1.376
## Number of obs: 1010, groups: site, 36
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.301343 0.269468 -4.829 1.37e-06 ***
## as.numeric(concentration) 0.043983 0.005328 8.255 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## as.nmrc(cn) -0.327
# simulate residuals and check model assumptions with DHARMa package
tmp_sim_model3_pres_num <- simulateResiduals(fittedModel = tmp_model3_pres_num)
plot(tmp_sim_model3_pres_num)
# fit model with categorical concentration
tmp_model3_pres_cat <- glmer(presence ~ as.factor(concentration) + (1 | site),
data = sugar_analysis_overall, family = binomial,
control = glmerControl(optimizer = "bobyqa"))
# show summary table to ensure the model has converged (no warnings)
summary(tmp_model3_pres_cat)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: presence ~ as.factor(concentration) + (1 | site)
## Data: sugar_analysis_overall
## Control: glmerControl(optimizer = "bobyqa")
##
## AIC BIC logLik deviance df.resid
## 1137.3 1166.8 -562.7 1125.3 1004
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.1386 -0.6349 -0.2993 0.7563 5.7872
##
## Random effects:
## Groups Name Variance Std.Dev.
## site (Intercept) 2.015 1.419
## Number of obs: 1010, groups: site, 36
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.1250 0.3307 -6.427 1.30e-10 ***
## as.factor(concentration)5 1.3603 0.2700 5.038 4.71e-07 ***
## as.factor(concentration)10 1.6683 0.2675 6.238 4.44e-10 ***
## as.factor(concentration)20 1.7733 0.2680 6.616 3.70e-11 ***
## as.factor(concentration)40 2.4261 0.2715 8.934 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) as.()5 a.()10 a.()20
## as.fctr(c)5 -0.515
## as.fctr()10 -0.523 0.640
## as.fctr()20 -0.524 0.640 0.649
## as.fctr()40 -0.529 0.639 0.649 0.651
# simulate residuals and check model assumptions with DHARMa package
tmp_sim_model3_pres_cat <- simulateResiduals(fittedModel = tmp_model3_pres_cat)
plot(tmp_sim_model3_pres_cat)
compare_performance(tmp_model3_pres_num, tmp_model3_pres_cat)
## # Comparison of Model Performance Indices
##
## Name | Model | AIC (weights) | AICc (weights)
## ----------------------------------------------------------------
## tmp_model3_pres_num | glmerMod | 1159.7 (<.001) | 1159.8 (<.001)
## tmp_model3_pres_cat | glmerMod | 1137.3 (>.999) | 1137.4 (>.999)
##
## Name | BIC (weights) | R2 (cond.) | R2 (marg.) | ICC | RMSE
## ------------------------------------------------------------------------------
## tmp_model3_pres_num | 1174.5 (0.021) | 0.410 | 0.070 | 0.365 | 0.419
## tmp_model3_pres_cat | 1166.8 (0.979) | 0.447 | 0.109 | 0.380 | 0.411
##
## Name | Sigma | Log_loss | Score_log | Score_spherical
## --------------------------------------------------------------------
## tmp_model3_pres_num | 1.000 | 0.522 | -Inf | 0.003
## tmp_model3_pres_cat | 1.000 | 0.507 | -Inf | 0.003
When comparing model performance and model diagnostics plots, it appears that the model with concentration as a factor performs significantly better, so that is the model I use for the analysis.
# final model
model3_presence <- glmer(presence ~ as.factor(concentration) + (1 | site),
data = sugar_analysis_overall, family = binomial,
control = glmerControl(optimizer = "bobyqa"))
# Check for overdispersion with performance package
check_overdispersion(model3_presence)
## # Overdispersion test
##
## dispersion ratio = 0.979
## p-value = 0.52
## No overdispersion detected.
# check model assumptions and residuals with performance package
check_model(model3_presence, verbose = T)
## Not enough model terms in the conditional part of the model to check for
## multicollinearity.
## Failed to compute posterior predictive checks with `re_formula=NULL`.
## Trying again with `re_formula=NA` now.
# check residuals with DHARMa package
tmp_model3_pres_sim <- simulateResiduals(fittedModel = model3_presence)
plot(tmp_model3_pres_sim)
The model shows no signs of overdispersion, the outlier test performed by the DHARMa package is not significant, and the residuals vs predicted values plot as well as the binned residuals plot do not show blatant or strong patterns in the residuals.
# show model results
Anova(model3_presence, type = "II")
## Analysis of Deviance Table (Type II Wald chisquare tests)
##
## Response: presence
## Chisq Df Pr(>Chisq)
## as.factor(concentration) 83.15 4 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(model3_presence)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: presence ~ as.factor(concentration) + (1 | site)
## Data: sugar_analysis_overall
## Control: glmerControl(optimizer = "bobyqa")
##
## AIC BIC logLik deviance df.resid
## 1137.3 1166.8 -562.7 1125.3 1004
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.1386 -0.6349 -0.2993 0.7563 5.7872
##
## Random effects:
## Groups Name Variance Std.Dev.
## site (Intercept) 2.015 1.419
## Number of obs: 1010, groups: site, 36
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.1250 0.3307 -6.427 1.30e-10 ***
## as.factor(concentration)5 1.3603 0.2700 5.038 4.71e-07 ***
## as.factor(concentration)10 1.6683 0.2675 6.238 4.44e-10 ***
## as.factor(concentration)20 1.7733 0.2680 6.616 3.70e-11 ***
## as.factor(concentration)40 2.4261 0.2715 8.934 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) as.()5 a.()10 a.()20
## as.fctr(c)5 -0.515
## as.fctr()10 -0.523 0.640
## as.fctr()20 -0.524 0.640 0.649
## as.fctr()40 -0.529 0.639 0.649 0.651
# Convert model estimates to Odds Ratios
tmp_model_or <- tidy(model3_presence, effects = "fixed") %>%
mutate(OR = exp(estimate), # Convert log-odds to ORs
OR_low = exp(estimate - 1.96 * std.error), # Lower CI
OR_high = exp(estimate + 1.96 * std.error)) # Upper CI
print(tmp_model_or)
## # A tibble: 5 × 9
## effect term estimate std.error statistic p.value OR OR_low OR_high
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 fixed (Intercept) -2.13 0.331 -6.43 1.30e-10 0.119 0.0625 0.228
## 2 fixed as.factor(… 1.36 0.270 5.04 4.71e- 7 3.90 2.30 6.62
## 3 fixed as.factor(… 1.67 0.267 6.24 4.44e-10 5.30 3.14 8.96
## 4 fixed as.factor(… 1.77 0.268 6.62 3.70e-11 5.89 3.48 9.96
## 5 fixed as.factor(… 2.43 0.272 8.93 4.09e-19 11.3 6.65 19.3
# report model results
report(model3_presence)
## We fitted a logistic mixed model (estimated using ML and BOBYQA optimizer) to
## predict presence with concentration (formula: presence ~
## as.factor(concentration)). The model included site as random effect (formula:
## ~1 | site). The model's total explanatory power is substantial (conditional R2
## = 0.45) and the part related to the fixed effects alone (marginal R2) is of
## 0.11. The model's intercept, corresponding to concentration = 0, is at -2.13
## (95% CI [-2.77, -1.48], p < .001). Within this model:
##
## - The effect of concentration [5] is statistically significant and positive
## (beta = 1.36, 95% CI [0.83, 1.89], p < .001; Std. beta = 1.36, 95% CI [0.83,
## 1.89])
## - The effect of concentration [10] is statistically significant and positive
## (beta = 1.67, 95% CI [1.14, 2.19], p < .001; Std. beta = 1.67, 95% CI [1.14,
## 2.19])
## - The effect of concentration [20] is statistically significant and positive
## (beta = 1.77, 95% CI [1.25, 2.30], p < .001; Std. beta = 1.77, 95% CI [1.25,
## 2.30])
## - The effect of concentration [40] is statistically significant and positive
## (beta = 2.43, 95% CI [1.89, 2.96], p < .001; Std. beta = 2.43, 95% CI [1.89,
## 2.96])
##
## Standardized parameters were obtained by fitting the model on a standardized
## version of the dataset. 95% Confidence Intervals (CIs) and p-values were
## computed using a Wald z-distribution approximation.
I fitted a generalized linear mixed model (GLMM) with a binomial distribution and a logit link function (estimated using ML (Laplace Approximation) and BOBYQA optimizer) to analyze the effect of sugar concentration on the presence or absence of foraging ants on baits (formula: presence ~ as.factor(concentration)). The model included site as a random effect (formula: ~ 1|site). The model’s total explanatory power is substantial (conditional R2 = 0.45) and the part related to the fixed effects alone (marginal R2) is of 0.11. The variance of the site random effect (variance = 2.015) indicates that there is variability in ant presence across different sites.
Sugar concentration shows a positive and highly significant effect on
ant presence (Type II Wald Chi-square test, Chisq(4) = 83.16, p <
2.2e-16), showing that baits with sugar water increase the probability
of ants being present on the baits compared to pure water baits.
Compared to the reference level (0% sugar), the odds of ant presence
increased significantly at all concentrations. More specifically, for
each concentration (as obtained by Wald z-tests):
- The effect of 5% sugar is statistically significant and positive (beta
= 1.36, SE = 0.27, z = 5.04, p = 4.70e-07)
- The effect of 10% sugar is statistically significant and positive
(beta = 1.67, SE = 0.27,z = 6.24, p = 4.44e-10)
- The effect of 20% sugar is statistically significant and positive
(beta = 1.77, SE = 0.27, z = 6.62 , p = 3.69e-11)
- The effect of 40% sugar is statistically significant and positive
(beta = 2.43, SE = 0.27, z = 8.94, p < 2e-16)
I can now run a post-hoc test to test for significant differences between the concentrations.
# compute pairwise comparisons between the concentrations
tmp_model3_pres_emm <- emmeans(model3_presence, pairwise ~ concentration, adjust = "tukey")
print(tmp_model3_pres_emm)
## $emmeans
## concentration emmean SE df asymp.LCL asymp.UCL
## 0 -2.125 0.331 Inf -2.773 -1.477
## 5 -0.765 0.301 Inf -1.354 -0.176
## 10 -0.457 0.297 Inf -1.039 0.126
## 20 -0.352 0.297 Inf -0.934 0.231
## 40 0.301 0.297 Inf -0.281 0.883
##
## Results are given on the logit (not the response) scale.
## Confidence level used: 0.95
##
## $contrasts
## contrast estimate SE df z.ratio p.value
## concentration0 - concentration5 -1.360 0.270 Inf -5.038 <.0001
## concentration0 - concentration10 -1.668 0.267 Inf -6.238 <.0001
## concentration0 - concentration20 -1.773 0.268 Inf -6.616 <.0001
## concentration0 - concentration40 -2.426 0.272 Inf -8.934 <.0001
## concentration5 - concentration10 -0.308 0.228 Inf -1.350 0.6599
## concentration5 - concentration20 -0.413 0.228 Inf -1.809 0.3682
## concentration5 - concentration40 -1.066 0.230 Inf -4.629 <.0001
## concentration10 - concentration20 -0.105 0.224 Inf -0.468 0.9902
## concentration10 - concentration40 -0.758 0.226 Inf -3.358 0.0070
## concentration20 - concentration40 -0.653 0.225 Inf -2.898 0.0308
##
## Results are given on the log odds ratio (not the response) scale.
## P value adjustment: tukey method for comparing a family of 5 estimates
To test for pairwise differences between the different sugar concentration levels, I conducted a post-hoc analysis using Tukey-adjusted Wald z-tests for pairwise comparisons of estimated marginal means (EMMs) on the logit scale. As described above, all sugar concentrations greater than 0% had a significantly positive effect on ant presence compared to pure water (p < 0.0001).
However, comparisons between intermediate concentrations revealed that there was no significant difference between 5% sugar and 10% sugar (beta = -0.31, SE = 0.29, z = -1.35, p = 0.66), 5% sugar and 20% sugar (beta = -0.41, SE = 0.23, z = -1.81, p = 0.37), or 10% sugar and 20% sugar (beta = -0.11, SE = 0.22, z = -0.47, p = 0.99). In contrast, the 40% sugar concentration had significantly greater ant presence than all other concentrations: 5% sugar (beta = -1.07, SE = 0.23, z = -4.63, p <.0001), 10% sugar (beta = -0.76. SE = 0.23, z = -3.36, p = 0.007), and 20% sugar (beta = -0.65, SE = 0.23, z = -2.90, P = 0.03), indicating a nonlinear response to sugar concentration.
I can plot the predicted probabilities from the model and the results of the pairwise comparisons:
# get predicted probabilities (transforme log-odds)
tmp_emm_preds <- emmeans(model3_presence, ~ concentration, type = "response")
# Convert to a data frame for plotting
tmp_emm_preds_df <- as.data.frame(tmp_emm_preds)
# get pairwise contrasts
tmp_posthoc_results <- pairs(tmp_emm_preds, adjust = "tukey")
# convert to compact letter display (CLD) format
tmp_cld_results <- multcomp::cld(tmp_emm_preds, Letters = letters)
# Merge with prediction data
tmp_emm_preds_df$letters <- tmp_cld_results$.group
(tmp_presence_pred_plot <- ggplot(tmp_emm_preds_df, aes(x = as.factor(concentration), y = prob)) +
geom_point(size = 2) + # Points for predicted probabilities
geom_errorbar(aes(ymin = asymp.LCL, ymax = asymp.UCL), width = 0.1, size = 0.3) + # 95% CI
geom_text(aes(label = letters, y = prob),
nudge_x = 0.2, # move label horizontally (increase to move it right)
size = 4) + # Set text size
labs(x = "Sugar concentration (%)", y = "Predicted proportion of baits occupied by ants") +
theme_bw() +
theme(axis.title.y = element_text(size = 10, margin = margin(r = 10), hjust = 0.5), # adjust distance from axis title to axis, center title
axis.title.x = element_text(size = 10, margin = margin(t = 10))))
# combine both presence plots into 1 plot
(tmp_presence_plots <- tmp_raw_presence_plot + tmp_presence_pred_plot +
plot_layout(nrow = 1) + plot_annotation(tag_levels = 'a', tag_suffix = ")")) # Arrange in 1 row, adds a), b), c)
rm(list = ls(pattern = "^tmp_"))
Now that I have tested whether sugar concentration affects the likelihood of ants being present at baits, I can look at the second part of the research question: when ants are present, do more ants recruit to the higher sugar concentrations ?
For this analysis, I will only look at experiments where at least one ant was present. For each experiment, I will only keep the rows for which the species total for that experiment is > 0 (i.e., that species was present on at least 1 bait of the experiment).
# remove experiments with no ants
sugar_analysis_abundance <- sugar_analysis %>%
group_by(sample_ID) %>% # group by experiment
dplyr::filter(any(individuals > 0, na.rm = T)) %>% # keep only exp where at least 1 individual is found
ungroup()
# only keep rows of species that were present on the experiment
sugar_analysis_abundance <- sugar_analysis_abundance %>%
filter(species_total_exp > 0)
The data frame now has 5 rows per species found on a given experiment (1 per concentration). I.e., for an experiment with 3 species found on it in total, there will be 1 row per species found per concentration, so 3 * 5 = 15 rows. An experiment were only 1 species was found will have 1 * 5 = 5 rows.
# Histogram of raw counts (individuals)
ggplot(na.omit(sugar_analysis_abundance), aes(x = individuals)) +
geom_histogram(binwidth = 5, fill = "skyblue", color = "black") +
labs(title = "Distribution of ant counts on baits", x = "Species-level ant count per bait", y = "Frequency") +
theme_classic2() +
theme(axis.title.y = element_text(margin = margin(r = 10), hjust = 0.5), # adjust distance from axis title to axis, center title
axis.title.x = element_text(margin = margin(t = 10)))
# number of individuals of each species vs sugar concentration
ggplot(na.omit(sugar_analysis_abundance), aes(x = as.factor(concentration), y = individuals)) +
geom_boxplot() +
labs(x = "Sugar concentration (%)", y = "Species-level number of individuals") +
theme_classic2() +
theme(axis.title.y = element_text(margin = margin(r = 10), hjust = 0.5), # adjust distance from axis title to axis, center title
axis.title.x = element_text(margin = margin(t = 10)))
# scaled ant activity per species vs concentration
(tmp_raw_abundance_plot <- ggplot(na.omit(sugar_analysis_abundance), aes(x = as.factor(concentration), y = scaled_species_total)) +
geom_boxplot(size = 0.3) +
labs(x = "Sugar concentration (%)", y = "Observed species-level proportion of individuals") +
theme_bw() +
theme(panel.grid.major = element_blank(), # Remove major grid
panel.grid.minor = element_blank(),
axis.title.y = element_text(size = 10, margin = margin(r = 10), hjust = 0.5), # adjust distance from axis title to axis, center title
axis.title.x = element_text(size = 10, margin = margin(t = 10))))
# for plotting overall (non species-specific) abundance, remove exp with 0 ants from sugar_analysis_overall
sugar_analysis_overall_abundance <- sugar_analysis_overall %>%
group_by(sample_ID) %>% # group by experiment
dplyr::filter(any(individuals > 0, na.rm = T)) %>% # keep only exp where at least 1 individual is found
ungroup()
# number of individuals overall vs sugar concentration
ggplot(na.omit(sugar_analysis_overall_abundance), aes(x = as.factor(concentration), y = individuals)) +
geom_boxplot() +
labs(x = "Sugar concentration (%)", y = "Overall number of individuals") +
theme_classic2() +
theme(axis.title.y = element_text(margin = margin(r = 10), hjust = 0.5), # adjust distance from axis title to axis, center title
axis.title.x = element_text(margin = margin(t = 10)))
# scaled ant activity overall vs concentration
ggplot(na.omit(sugar_analysis_overall_abundance), aes(x = as.factor(concentration), y = scaled_ant_total)) +
geom_boxplot() +
labs(x = "Sugar concentration (%)", y = "Overall proportion of individuals") +
theme_classic2() +
theme(axis.title.y = element_text(margin = margin(r = 10), hjust = 0.5), # adjust distance from axis title to axis, center title
axis.title.x = element_text(margin = margin(t = 10)))
I want to test the influence of sugar concentration on ant abundance. My response variable is based on count data: I look at the number of ants of one species present on one concentration of one experiment, offset by the total number of ants of that species on that experiment (i.e., the proportion of ants of one species on each bait of one experiment).
Since my response variable is a proportion, I can fit a binomial GLMM with the number of individuals of one species on one bait as a “success”, and the number of ants on all other baits (total number of ants of the experiments - number of ants on the bait in question) as a “failure”. If the data shows significant overdispersion, I can try to fit a beta-binomial GLMM.
The sugar concentration predictor can either be considered as a discrete numerical variable or a ordinal categorical variable. I also fit two models with these two options and check which model performs best.
# Binomial GLMM model with numerical concentration
tmp_model3_ab_num <- glmmTMB(cbind(individuals, species_total_exp - individuals) ~
concentration + (1 | site) + (1 | sp_abbr),
family = binomial(link = "logit"),
data = sugar_analysis_abundance)
# Check for overdispersion with performance package
check_overdispersion(tmp_model3_ab_num)
## # Overdispersion test
##
## dispersion ratio = 1.538
## p-value = < 0.001
## Overdispersion detected.
# check residuals with DHARMa package
tmp_model3_ab_sim <- simulateResiduals(fittedModel = tmp_model3_ab_num)
plot(tmp_model3_ab_sim)
## DHARMa:testOutliers with type = binomial may have inflated Type I error rates for integer-valued distributions. To get a more exact result, it is recommended to re-run testOutliers with type = 'bootstrap'. See ?testOutliers for details
# Binomial GLMM model with categorical concentration
tmp_model3_ab_cat <- glmmTMB(cbind(individuals, species_total_exp - individuals) ~
as.factor(concentration) + (1 | site) + (1 | sp_abbr),
family = binomial(link = "logit"),
data = sugar_analysis_abundance)
# Check for overdispersion with performance package
check_overdispersion(tmp_model3_ab_cat)
## # Overdispersion test
##
## dispersion ratio = 1.545
## p-value = < 0.001
## Overdispersion detected.
# check residuals with DHARMa package
tmp_model3_ab_sim <- simulateResiduals(fittedModel = tmp_model3_ab_cat)
plot(tmp_model3_ab_sim)
## DHARMa:testOutliers with type = binomial may have inflated Type I error rates for integer-valued distributions. To get a more exact result, it is recommended to re-run testOutliers with type = 'bootstrap'. See ?testOutliers for details
Both the overdispersion test from the performance
package as well as the plots from the DHARMa package show
overdisperion (with concentration as a factor or as a continuous
variable). A binomial GLMM would thus not fit my data.
I try to fit a beta-binomial GLMM.
# Beta-binomial GLMM with numerical concentration
tmp_model3_ab_num <- glmmTMB(cbind(individuals, species_total_exp - individuals) ~
concentration + (1 | site) + (1 | sp_abbr),
family = betabinomial, # Beta-binomial family
data = sugar_analysis_abundance)
# show summary table to ensure the model has converged (no warnings)
summary(tmp_model3_ab_num)
## Family: betabinomial ( logit )
## Formula:
## cbind(individuals, species_total_exp - individuals) ~ concentration +
## (1 | site) + (1 | sp_abbr)
## Data: sugar_analysis_abundance
##
## AIC BIC logLik deviance df.resid
## 3056.6 3081.5 -1523.3 3046.6 1071
##
## Random effects:
##
## Conditional model:
## Groups Name Variance Std.Dev.
## site (Intercept) 1.188e-09 3.447e-05
## sp_abbr (Intercept) 7.827e-11 8.847e-06
## Number of obs: 1076, groups: site, 32; sp_abbr, 27
##
## Dispersion parameter for betabinomial family (): 1.68
##
## Conditional model:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.226033 0.090701 -24.54 <2e-16 ***
## concentration 0.051403 0.003648 14.09 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Check for overdispersion with performance package
check_overdispersion(tmp_model3_ab_num)
## # Overdispersion test
##
## dispersion ratio = 0.924
## p-value = 0.792
## No overdispersion detected.
# check model assumptions and residuals with performance package
check_model(tmp_model3_ab_num, verbose = T)
## Not enough model terms in the conditional part of the model to check for
## multicollinearity.
## `check_outliers()` does not yet support models of class `glmmTMB`.
## Using `ci_type = "gaussian"` because model is not bernoulli.
# check residuals with DHARMa package
tmp_model3_ab_sim <- simulateResiduals(fittedModel = tmp_model3_ab_num)
plot(tmp_model3_ab_sim)
# fit model with categorical concentration
tmp_model3_ab_cat <- glmmTMB(cbind(individuals, species_total_exp - individuals) ~
as.factor(concentration) + (1 | site) + (1 | sp_abbr),
family = betabinomial, # Beta-binomial family
data = sugar_analysis_abundance)
# show summary table to ensure the model has converged (no warnings)
summary(tmp_model3_ab_cat)
## Family: betabinomial ( logit )
## Formula:
## cbind(individuals, species_total_exp - individuals) ~ as.factor(concentration) +
## (1 | site) + (1 | sp_abbr)
## Data: sugar_analysis_abundance
##
## AIC BIC logLik deviance df.resid
## 3033.5 3073.3 -1508.7 3017.5 1068
##
## Random effects:
##
## Conditional model:
## Groups Name Variance Std.Dev.
## site (Intercept) 3.959e-10 1.990e-05
## sp_abbr (Intercept) 5.966e-10 2.443e-05
## Number of obs: 1076, groups: site, 32; sp_abbr, 27
##
## Dispersion parameter for betabinomial family (): 1.74
##
## Conditional model:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.8967 0.1823 -15.893 < 2e-16 ***
## as.factor(concentration)5 0.9732 0.2186 4.453 8.48e-06 ***
## as.factor(concentration)10 1.4581 0.2118 6.883 5.84e-12 ***
## as.factor(concentration)20 1.9305 0.2113 9.138 < 2e-16 ***
## as.factor(concentration)40 2.5757 0.2080 12.381 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Check for overdispersion with performance package
check_overdispersion(tmp_model3_ab_cat)
## # Overdispersion test
##
## dispersion ratio = 0.970
## p-value = 0.976
## No overdispersion detected.
# check model assumptions and residuals with performance package
check_model(tmp_model3_ab_cat, verbose = T)
## Not enough model terms in the conditional part of the model to check for
## multicollinearity.
## `check_outliers()` does not yet support models of class `glmmTMB`.
## Using `ci_type = "gaussian"` because model is not bernoulli.
# check residuals with DHARMa package
tmp_model3_ab_sim <- simulateResiduals(fittedModel = tmp_model3_ab_cat)
plot(tmp_model3_ab_sim)
compare_performance(tmp_model3_ab_num, tmp_model3_ab_cat)
## Random effect variances not available. Returned R2 does not account for random effects.
## Random effect variances not available. Returned R2 does not account for random effects.
## # Comparison of Model Performance Indices
##
## Name | Model | AIC (weights) | AICc (weights) | BIC (weights)
## ------------------------------------------------------------------------------
## tmp_model3_ab_num | glmmTMB | 3056.6 (<.001) | 3056.6 (<.001) | 3081.5 (0.017)
## tmp_model3_ab_cat | glmmTMB | 3033.5 (>.999) | 3033.6 (>.999) | 3073.3 (0.983)
##
## Name | R2 (cond.) | R2 (marg.) | RMSE | Sigma | Log_loss
## ----------------------------------------------------------------------
## tmp_model3_ab_num | | 0.379 | 0.314 | 1.682 |
## tmp_model3_ab_cat | | 0.471 | 0.313 | 1.736 |
The beta-binomial model shows no signs of overdispersion and the residuals vs predicted values plot does not show blatant or strong patterns in the residuals. A beta-binomial GLMM is an appropriate for my data.
When comparing model performance and model diagnostics plots, it appears that the model with concentration as a factor performs better, so that is the model I use for the analysis.
# final model
model3_abundance <- glmmTMB(cbind(individuals, species_total_exp - individuals) ~
as.factor(concentration) + (1 | site) + (1 | sp_abbr),
family = betabinomial, # Beta-binomial family
data = sugar_analysis_abundance)
# Check for overdispersion with performance package
check_overdispersion(model3_abundance) # overdispersion
## # Overdispersion test
##
## dispersion ratio = 0.970
## p-value = 0.976
## No overdispersion detected.
# check model assumptions with performance package
check_model(model3_abundance, verbose = T)
## Not enough model terms in the conditional part of the model to check for
## multicollinearity.
## `check_outliers()` does not yet support models of class `glmmTMB`.
## Using `ci_type = "gaussian"` because model is not bernoulli.
# check residuals with DHARMa package
tmp_model3_ab_sim <- simulateResiduals(fittedModel = model3_abundance)
plot(tmp_model3_ab_sim)
The beta-binomial model shows no signs of overdispersion, the outlier
test performed by the DHARMa package is not significant,
and the residuals vs predicted values plot does not show blatant or
strong patterns in the residuals. The diagnostics plots from the
performance package show no clear deviation from model
assumptions.
# show model results
Anova(model3_abundance, type = "II")
## Analysis of Deviance Table (Type II Wald chisquare tests)
##
## Response: cbind(individuals, species_total_exp - individuals)
## Chisq Df Pr(>Chisq)
## as.factor(concentration) 195.72 4 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(model3_abundance)
## Family: betabinomial ( logit )
## Formula:
## cbind(individuals, species_total_exp - individuals) ~ as.factor(concentration) +
## (1 | site) + (1 | sp_abbr)
## Data: sugar_analysis_abundance
##
## AIC BIC logLik deviance df.resid
## 3033.5 3073.3 -1508.7 3017.5 1068
##
## Random effects:
##
## Conditional model:
## Groups Name Variance Std.Dev.
## site (Intercept) 3.959e-10 1.990e-05
## sp_abbr (Intercept) 5.966e-10 2.443e-05
## Number of obs: 1076, groups: site, 32; sp_abbr, 27
##
## Dispersion parameter for betabinomial family (): 1.74
##
## Conditional model:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.8967 0.1823 -15.893 < 2e-16 ***
## as.factor(concentration)5 0.9732 0.2186 4.453 8.48e-06 ***
## as.factor(concentration)10 1.4581 0.2118 6.883 5.84e-12 ***
## as.factor(concentration)20 1.9305 0.2113 9.138 < 2e-16 ***
## as.factor(concentration)40 2.5757 0.2080 12.381 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Convert model estimates to Odds Ratios
tmp_model_ab_or <- tidy(model3_abundance, effects = "fixed") %>%
mutate(OR = exp(estimate), # Convert log-odds to ORs
OR_low = exp(estimate - 1.96 * std.error), # Lower CI
OR_high = exp(estimate + 1.96 * std.error)) # Upper CI
print(tmp_model_ab_or)
## # A tibble: 5 × 10
## effect component term estimate std.error statistic p.value OR OR_low
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 fixed cond (Interc… -2.90 0.182 -15.9 7.03e-57 0.0552 0.0386
## 2 fixed cond as.fact… 0.973 0.219 4.45 8.48e- 6 2.65 1.72
## 3 fixed cond as.fact… 1.46 0.212 6.88 5.84e-12 4.30 2.84
## 4 fixed cond as.fact… 1.93 0.211 9.14 6.38e-20 6.89 4.56
## 5 fixed cond as.fact… 2.58 0.208 12.4 3.30e-35 13.1 8.74
## # ℹ 1 more variable: OR_high <dbl>
# report model results
report(model3_abundance)
## Random effect variances not available. Returned R2 does not account for random effects.
## Can't calculate log-loss.
## Can't calculate proper scoring rules for models without integer response
## values.
## Warning in text == "" | text2 == "": longer object length is not a multiple of
## shorter object length
## Warning in text == "" | text2 == "": longer object length is not a multiple of
## shorter object length
## Random effect variances not available. Returned R2 does not account for random effects.
## Can't calculate log-loss.
## Can't calculate proper scoring rules for models without integer response
## values.
## We fitted a logistic mixed model (estimated using ML and nlminb optimizer) to
## predict cbind(individuals, species_total_exp - individuals) with concentration
## (formula: cbind(individuals, species_total_exp - individuals) ~
## as.factor(concentration)). The model included site as random effects (formula:
## list(~1 | site, ~1 | sp_abbr)). The model's explanatory power related to the
## fixed effects alone (marginal R2) is 0.47. The model's intercept, corresponding
## to concentration = 0, is at -2.90 (95% CI [-3.25, -2.54], p < .001). Within
## this model:
##
## - The effect of concentration [10] is statistically significant and positive
## (beta = 0.97, 95% CI [0.54, 1.40], p < .001; Std. beta = 1.46, 95% CI [1.04,
## 1.87])
## - The effect of concentration [20] is statistically significant and positive
## (beta = 1.46, 95% CI [1.04, 1.87], p < .001; Std. beta = 1.93, 95% CI [1.52,
## 2.34])
## - The effect of concentration [40] is statistically significant and positive
## (beta = 1.93, 95% CI [1.52, 2.34], p < .001; Std. beta = 2.58, 95% CI [2.17,
## 2.98])
## - The intercept is statistically significant and positive (beta = 2.58, 95% CI
## [2.17, 2.98], p < .001; Std. beta = -2.90, 95% CI [-3.25, -2.54])
##
## Standardized parameters were obtained by fitting the model on a standardized
## version of the dataset. 95% Confidence Intervals (CIs) and p-values were
## computed using a Wald z-distribution approximation. and We fitted a logistic
## mixed model (estimated using ML and nlminb optimizer) to predict
## cbind(individuals, species_total_exp - individuals) with concentration
## (formula: cbind(individuals, species_total_exp - individuals) ~
## as.factor(concentration)). The model included site as random effects (formula:
## list(~1 | site, ~1 | sp_abbr)). The model's explanatory power related to the
## fixed effects alone (marginal R2) is 0.47. The model's intercept, corresponding
## to concentration = 0, is at 1.74 (95% CI [1.48, 2.04]). Within this model:
##
## - The effect of concentration [10] is statistically significant and positive
## (beta = 0.97, 95% CI [0.54, 1.40], p < .001; Std. beta = 1.46, 95% CI [1.04,
## 1.87])
## - The effect of concentration [20] is statistically significant and positive
## (beta = 1.46, 95% CI [1.04, 1.87], p < .001; Std. beta = 1.93, 95% CI [1.52,
## 2.34])
## - The effect of concentration [40] is statistically significant and positive
## (beta = 1.93, 95% CI [1.52, 2.34], p < .001; Std. beta = 2.58, 95% CI [2.17,
## 2.98])
## - The intercept is statistically significant and positive (beta = 2.58, 95% CI
## [2.17, 2.98], p < .001; Std. beta = -2.90, 95% CI [-3.25, -2.54])
##
## Standardized parameters were obtained by fitting the model on a standardized
## version of the dataset. 95% Confidence Intervals (CIs) and p-values were
## computed using a Wald z-distribution approximation.
I fitted a generalized linear mixed model (GLMM) with a beta-binomial distribution (estimated using ML and nlminb optimizer) to analyze the effect of sugar concentration on the species-level abundance (modeled as the species-specific proportion of ants on each sugar concentration bait) of foraging ants on baits (formula: cbind(individuals, species_total_exp - individuals) ~ as.factor(concentration)). The model included random intercepts for site (formula: ~ 1|site) as well as species (formula: ~ 1|sp_abbr). The model’s explanatory power related to the fixed effects alone (marginal R2) is 0.47. The very small variance of the random effects of site (variance = 3.959e-10) and species (variance = 5.966e-10) indicates that there is little site- or species-level variation in ant abundance once sugar concentration is accounted for.
The model fit yielded an AIC of 3033.5 and a BIC of 3073.3, with a log-likelihood of -1508.7 and a deviance of 3017.5. The dispersion parameter for the beta-binomial family was estimated to be 1.74, suggesting moderate dispersion in the data, which is appropriate for this family of distributions. The model included 1076 observations, with random effects for 32 sites and 27 species.
Sugar concentration shows a positive and highly significant effect on
ant presence (Type II Wald Chi-square test, Chisq(4) = 195.72, p <
2.2e-16), showing that baits with sugar water increase the species-level
abundance of ants on the baits compared to pure water baits. Compared to
the reference level (0% sugar), species-level ant abundance increased
significantly at all concentrations. More specifically, for each
concentration (as obtained by Wald z-tests):
- The effect of 5% sugar is statistically significant and positive (beta
= 0.97, SE = 0.22, z = 4.45, p = 8.48e-06)
- The effect of 10% sugar is statistically significant and positive
(beta = 1.46, SE = 0.21,z = 6.88, p = 5.84e-12)
- The effect of 20% sugar is statistically significant and positive
(beta = 1.93, SE = 0.21, z = 9.14, p < 2e-16)
- The effect of 40% sugar is statistically significant and positive
(beta = 2.58, SE = 0.21, z = 12.38, p < 2e-16)
I can now run a post-hoc test to test for significant differences between the concentrations.
# compute pairwise comparisons between the concentrations
tmp_model3_ab_emm <- emmeans(model3_abundance, pairwise ~ concentration, adjust = "tukey")
print(tmp_model3_ab_emm)
## $emmeans
## concentration emmean SE df asymp.LCL asymp.UCL
## 0 -2.897 0.182 Inf -3.25 -2.540
## 5 -1.924 0.131 Inf -2.18 -1.666
## 10 -1.439 0.117 Inf -1.67 -1.209
## 20 -0.966 0.110 Inf -1.18 -0.750
## 40 -0.321 0.102 Inf -0.52 -0.122
##
## Results are given on the logit (not the response) scale.
## Confidence level used: 0.95
##
## $contrasts
## contrast estimate SE df z.ratio p.value
## concentration0 - concentration5 -0.973 0.219 Inf -4.453 0.0001
## concentration0 - concentration10 -1.458 0.212 Inf -6.883 <.0001
## concentration0 - concentration20 -1.931 0.211 Inf -9.138 <.0001
## concentration0 - concentration40 -2.576 0.208 Inf -12.381 <.0001
## concentration5 - concentration10 -0.485 0.171 Inf -2.841 0.0363
## concentration5 - concentration20 -0.957 0.170 Inf -5.644 <.0001
## concentration5 - concentration40 -1.602 0.165 Inf -9.686 <.0001
## concentration10 - concentration20 -0.472 0.159 Inf -2.967 0.0251
## concentration10 - concentration40 -1.118 0.154 Inf -7.239 <.0001
## concentration20 - concentration40 -0.645 0.150 Inf -4.306 0.0002
##
## Results are given on the log odds ratio (not the response) scale.
## P value adjustment: tukey method for comparing a family of 5 estimates
To test for pairwise differences between the different sugar concentration levels, I conducted a post-hoc analysis using Tukey-adjusted Wald z-tests for pairwise comparisons of estimated marginal means (EMMs) on the logit scale.
The estimated marginal means for each sugar concentration level are shown below. These results are given on the logit scale.
Concentration Emmean SE 95% CI Lower Bound 95% CI Upper Bound 0% -2.897 0.182 -3.25 -2.540 5% -1.924 0.131 -2.18 -1.666 10% -1.439 0.117 -1.67 -1.209 20% -0.966 0.110 -1.18 -0.750 40% -0.321 0.102 -0.52 -0.122
The pairwise contrasts for sugar concentration levels (with Tukey adjustment for multiple comparisons) are summarized below.
Contrast Estimate SE z-ratio p-value concentration 0% - 5% -0.973 0.219 -4.453 0.0001 concentration 0% - 10% -1.458 0.212 -6.883 <.0001 concentration 0% - 20% -1.931 0.211 -9.138 <.0001 concentration 0% - 40% -2.576 0.208 -12.381 <.0001 concentration 5% - 10% -0.485 0.171 -2.841 0.0363 concentration 5% - 20% -0.957 0.170 -5.644 <.0001 concentration 5% - 40% -1.602 0.165 -9.686 <.0001 concentration 10% - 20% -0.472 0.159 -2.967 0.0251 concentration 10% - 40% -1.118 0.154 -7.239 <.0001 concentration 20% - 40% -0.645 0.150 -4.306 0.0002
As described above, all sugar concentrations greater than 0% had a significantly positive effect on ant abundance compared to pure water (p < 0.0001). Additionally, all concentrations were significantly different from each other (see Table, p < 0.05).
I can plot the predicted probabilities from the model and the results of the pairwise comparisons:
# get predicted probabilities (transform log-odds)
tmp_emm_preds <- emmeans(model3_abundance, ~ concentration, type = "response")
# Convert to a data frame for plotting
tmp_emm_preds_df <- as.data.frame(tmp_emm_preds)
# get pairwise contrasts
tmp_posthoc_results <- pairs(tmp_emm_preds, adjust = "tukey")
# convert to compact letter display (CLD) format
tmp_cld_results <- multcomp::cld(tmp_emm_preds, Letters = letters)
# Merge with prediction data
tmp_emm_preds_df$letters <- tmp_cld_results$.group
(tmp_pred_abundance_plot <- ggplot(tmp_emm_preds_df, aes(x = as.factor(concentration), y = prob)) +
geom_point(size = 2) + # Points for predicted probabilities
geom_errorbar(aes(ymin = asymp.LCL, ymax = asymp.UCL), width = 0.1, size = 0.3) + # 95% CI
geom_text(aes(label = letters, y = prob, x = 1:5),
nudge_x = 0.25, # move label horizontally (increase to move it right)
size = 4) + # Set text size
labs(x = "Sugar concentration (%)", y = "Predicted species-level proportion of individuals") +
theme_bw() +
theme(axis.title.y = element_text(margin = margin(r = 10), hjust = 0.5), # adjust distance from axis title to axis, center title
axis.title.x = element_text(margin = margin(t = 10)))
)
# combine both abundance plots into 1 plot
(tmp_abundance_plots <- tmp_raw_abundance_plot + tmp_pred_abundance_plot +
plot_layout(nrow = 1) + plot_annotation(tag_levels = 'a', tag_suffix = ")")) # Arrange in 1 row, adds a), b), c)
rm(list = ls(pattern = "^tmp_"))
The overarching research question is: if ants can differentiate between baits of different sugar concentrations, how fast does that differentiation happen ? Do they immediately preferentially recruit to higher concentration baits, or does this preference establish over the time of the experiment ?
In other words, does the proportion of ants on the different sugar concentration baits largely stay the same throughout the experiment, or does the proportion of ants on higher concentrations increase during the 2 hours of the experiment ?
Description of tested variables
I can find the number of experiments where no ant showed up at any time point. 210 experiments had at least 1 ant at 1 time point, meaning 239 - 210 = 29 experiments had no ants for the entire duration.
#remove experiments with no ants
tmp_speed_analysis_no_0 <- speed_analysis %>%
group_by(sample_ID) %>% # group by experiment
dplyr::filter(any(individuals > 0, na.rm = T)) %>% # keep only exp where at least 1 individual is found
ungroup()
# number of exp with at least 1 ant over the duration of the experiment
length(unique(tmp_speed_analysis_no_0$sample_ID))
## [1] 210
# numbe rof experiments with no ants at any time
length(unique(speed_analysis$sample_ID)) - length(unique(tmp_speed_analysis_no_0$sample_ID))
## [1] 29
I can also plot the distribution of the number of ant individuals found on one bait at one time point. The histogram shows that the vast majority of baits at one time point had no ants on them.
# Histogram of ant counts per bait
ggplot(na.omit(speed_analysis), aes(x = individuals)) +
geom_histogram(binwidth = 1, fill = "skyblue", color = "black") +
labs(title = "Distribution of ant counts on baits at one time point", x = "Number of ants", y = "Frequency")
# Histogram of ant counts per bait, removing exp with 0 ants
ggplot(na.omit(tmp_speed_analysis_no_0), aes(x = individuals)) +
geom_histogram(binwidth = 1, fill = "skyblue", color = "black") +
labs(title = "Distribution of ant counts on baits at one time point \n(experiments with at least 1 ant present)", x = "Number of ants", y = "Frequency")
# plot ants per experiment across time
ggplot(na.omit(speed_analysis), aes(x = count_time, y = total_ants_time, group = sample_ID)) +
geom_line(aes(color = sample_ID), alpha = 0.7) +
geom_point() +
labs(title = "Total ant counts over time", x = "Time since experiment start (minutes)", y = "Number of ants per experiment")+
theme(legend.position = "none")
# plot ants per concentration over time
ggplot(na.omit(tmp_speed_analysis_no_0), aes(x = count_time, y = scaled_ants_time, color = concentration, group = concentration)) +
#geom_point() +
geom_smooth(method = "glm", span = 0.9, se = F, size = 1.5)+
scale_color_viridis_d() +
scale_x_continuous(breaks = c(5, 10, 20, 40, 80, 120), limits = c(0, 122))+
theme_classic()+
labs(x = "Time since experiment start (minutes)", y = "Proportion of ants", color = "Sugar concentration")
## `geom_smooth()` using formula = 'y ~ x'
I want to test whether the influence of sugar concentration on ant abundance changes over the time of the experiment. My response variable is based on count data: I look at the number of ants present on one concentration at one counting time point, offset by the total number of ants on that experiment at that time point (i.e., the proportion of ants on each bait of one experiment at each counting time point).
Since my response variable is a proportion, I can fit a binomial GLMM with the number of individuals on one bait at one time as a “success”, and the number of ants on all other baits at that time points (total number of ants on the experiment - number of ants on the bait in question) as a “failure”. If the data shows significant overdispersion, I can try to fit a beta-binomial GLMM.
The sugar concentration predictor can either be considered as a discrete numerical variable or a ordinal categorical variable. The time variable can be considered a categorical variable (since the counting time points are not linearly distributed) or as a numerical variable (which can also be log-transformed to have a linearized time variable). I fit several models with these different options and check which model performs best.
# binomial GLMM with numerical time
tmp_model4_bin <- glmer(cbind(individuals, total_ants_time - individuals) ~
concentration * count_time + (1 | site),
family = binomial,
data = na.omit(speed_analysis))
## boundary (singular) fit: see help('isSingular')
# check for overdispersion
check_overdispersion(tmp_model4_bin)
## # Overdispersion test
##
## dispersion ratio = 1.446
## p-value = < 0.001
## Overdispersion detected.
# check model residuals
tmp_model4_simres <- simulateResiduals(tmp_model4_bin)
plot(tmp_model4_simres)
## DHARMa:testOutliers with type = binomial may have inflated Type I error rates for integer-valued distributions. To get a more exact result, it is recommended to re-run testOutliers with type = 'bootstrap'. See ?testOutliers for details
The binomial GLMM shows overdispersion. I will try to fit a beta-binomial GLMM and compare the performance of both models.
# beta-binomial GLMM
tmp_model4_betabin <- glmmTMB(cbind(individuals, total_ants_time - individuals) ~
concentration * count_time + (1 | site),
family = betabinomial(link = "logit"),
data = na.omit(speed_analysis))
# check for overdispersion
check_overdispersion(tmp_model4_betabin)
## # Overdispersion test
##
## dispersion ratio = 0.894
## p-value = 0.488
## No overdispersion detected.
# check model residuals with DHARMa package
tmp_model4_bb_simres <- simulateResiduals(tmp_model4_betabin)
plot(tmp_model4_bb_simres)
## DHARMa:testOutliers with type = binomial may have inflated Type I error rates for integer-valued distributions. To get a more exact result, it is recommended to re-run testOutliers with type = 'bootstrap'. See ?testOutliers for details
# compare initial model performance with beta-binomial GLMM
compare_performance(tmp_model4_betabin, tmp_model4_bin)
## Random effect variances not available. Returned R2 does not account for random effects.
## boundary (singular) fit: see help('isSingular')
## Random effect variances not available. Returned R2 does not account for random effects.
## boundary (singular) fit: see help('isSingular')
## # Comparison of Model Performance Indices
##
## Name | Model | AIC (weights) | AICc (weights)
## -----------------------------------------------------------------
## tmp_model4_betabin | glmmTMB | 9499.1 (>.999) | 9499.2 (>.999)
## tmp_model4_bin | glmerMod | 16568.1 (<.001) | 16568.2 (<.001)
##
## Name | BIC (weights) | R2 (cond.) | R2 (marg.) | RMSE | Sigma | Log_loss
## -----------------------------------------------------------------------------------------
## tmp_model4_betabin | 9581.2 (>.999) | | 0.373 | 0.294 | 2.666 |
## tmp_model4_bin | 16643.4 (<.001) | | 0.744 | 0.274 | 1.000 |
The beta-binomial model performs significantly better than the binomial GLMM, and will be used for further analysis.
The counting time can be modeled either as a categorical variable, as a numerical variable or as a log-transformed numerical variable. I model all three and compare model performance.
# beta-binomial GLMM with categorical count times
tmp_model4_betabin_cat <- glmmTMB(cbind(individuals, total_ants_time - individuals) ~
concentration * as.factor(count_time) + (1 | site),
family = betabinomial(link = "logit"),
data = na.omit(speed_analysis))
# check for overdispersion
check_overdispersion(tmp_model4_betabin_cat)
## # Overdispersion test
##
## dispersion ratio = 0.890
## p-value = 0.552
## No overdispersion detected.
# check model residuals with DHARMa package
tmp_model4_bb_simres <- simulateResiduals(tmp_model4_betabin_cat)
plot(tmp_model4_bb_simres)
## DHARMa:testOutliers with type = binomial may have inflated Type I error rates for integer-valued distributions. To get a more exact result, it is recommended to re-run testOutliers with type = 'bootstrap'. See ?testOutliers for details
# beta-binomial GLMM with log-transformed numerical count times
tmp_model4_betabin_log <- glmmTMB(cbind(individuals, total_ants_time - individuals) ~
concentration * log(count_time) + (1 | site),
family = betabinomial(link = "logit"),
data = na.omit(speed_analysis))
# check for overdispersion
check_overdispersion(tmp_model4_betabin_log)
## # Overdispersion test
##
## dispersion ratio = 0.895
## p-value = 0.544
## No overdispersion detected.
# check model residuals with DHARMa package
tmp_model4_bb_simres <- simulateResiduals(tmp_model4_betabin_log)
plot(tmp_model4_bb_simres)
## DHARMa:testOutliers with type = binomial may have inflated Type I error rates for integer-valued distributions. To get a more exact result, it is recommended to re-run testOutliers with type = 'bootstrap'. See ?testOutliers for details
# compare both model performances
compare_performance(tmp_model4_betabin, tmp_model4_betabin_cat, tmp_model4_betabin_log)
## Random effect variances not available. Returned R2 does not account for random effects.
## Random effect variances not available. Returned R2 does not account for random effects.
## Random effect variances not available. Returned R2 does not account for random effects.
## # Comparison of Model Performance Indices
##
## Name | Model | AIC (weights) | AICc (weights)
## ------------------------------------------------------------------
## tmp_model4_betabin | glmmTMB | 9499.1 (0.833) | 9499.2 (0.833)
## tmp_model4_betabin_cat | glmmTMB | 9530.2 (<.001) | 9530.5 (<.001)
## tmp_model4_betabin_log | glmmTMB | 9502.4 (0.167) | 9502.4 (0.167)
##
## Name | BIC (weights) | R2 (cond.) | R2 (marg.) | RMSE
## -------------------------------------------------------------------------
## tmp_model4_betabin | 9581.2 (0.833) | | 0.373 | 0.294
## tmp_model4_betabin_cat | 9749.0 (<.001) | | 0.382 | 0.294
## tmp_model4_betabin_log | 9584.4 (0.167) | | 0.373 | 0.294
##
## Name | Sigma | Log_loss
## -----------------------------------------
## tmp_model4_betabin | 2.666 |
## tmp_model4_betabin_cat | 2.675 |
## tmp_model4_betabin_log | 2.671 |
The model with count time as a numerical variable performs much better than the categorical count time model and the log(time) model, so I continue the analysis with time as a numerical variable.
I also compare models where sugar concentration is modeled as a numerical variable and as a categorical variable.
# beta-binomial GLMM with numerical concentration
tmp_model4_betabin_num <- glmmTMB(cbind(individuals, total_ants_time - individuals) ~
as.numeric(as.character(concentration)) * count_time + (1 | site),
family = betabinomial(link = "logit"),
data = na.omit(speed_analysis))
# check for overdispersion
check_overdispersion(tmp_model4_betabin_num)
## # Overdispersion test
##
## dispersion ratio = 0.886
## p-value = 0.52
## No overdispersion detected.
# check model residuals with DHARMa package
tmp_model4_bb_simres <- simulateResiduals(tmp_model4_betabin_num)
plot(tmp_model4_bb_simres)
# compare both model performances
compare_performance(tmp_model4_betabin, tmp_model4_betabin_num)
## Random effect variances not available. Returned R2 does not account for random effects.
## Random effect variances not available. Returned R2 does not account for random effects.
## # Comparison of Model Performance Indices
##
## Name | Model | AIC (weights) | AICc (weights)
## ------------------------------------------------------------------
## tmp_model4_betabin | glmmTMB | 9499.1 (>.999) | 9499.2 (>.999)
## tmp_model4_betabin_num | glmmTMB | 9576.6 (<.001) | 9576.6 (<.001)
##
## Name | BIC (weights) | R2 (cond.) | R2 (marg.) | RMSE
## -------------------------------------------------------------------------
## tmp_model4_betabin | 9581.2 (>.999) | | 0.373 | 0.294
## tmp_model4_betabin_num | 9617.6 (<.001) | | 0.290 | 0.295
##
## Name | Sigma | Log_loss
## -----------------------------------------
## tmp_model4_betabin | 2.666 |
## tmp_model4_betabin_num | 2.517 |
The model with concentration as a categorical variable performs much better than the numerical concentration model, so I continue the analysis with concentration as a categorical variable.
# define final model
model4 <- glmmTMB(cbind(individuals, total_ants_time - individuals) ~
concentration * count_time + (1 | site),
family = betabinomial(link = "logit"),
data = na.omit(speed_analysis))
# check for overdispersion
check_overdispersion(model4)
## # Overdispersion test
##
## dispersion ratio = 0.894
## p-value = 0.488
## No overdispersion detected.
# check model residuals with DHARMa package
tmp_model4_bb_simres <- simulateResiduals(model4)
plot(tmp_model4_bb_simres)
## DHARMa:testOutliers with type = binomial may have inflated Type I error rates for integer-valued distributions. To get a more exact result, it is recommended to re-run testOutliers with type = 'bootstrap'. See ?testOutliers for details
The beta-binomial model shows no signs of overdispersion (tested with
the performance package) and the residuals vs predicted
values plot as well as the Q-Q plot from the DHARMa package
do not show blatant or strong patterns in the residuals.
The model is a beta-binomial GLMM.
# type III anova table
options(contrasts = c("contr.sum", "contr.poly")) # specify sum-to-zero contrasts
Anova(model4, type = "III")
## Analysis of Deviance Table (Type III Wald chisquare tests)
##
## Response: cbind(individuals, total_ants_time - individuals)
## Chisq Df Pr(>Chisq)
## (Intercept) 237.544 1 < 2.2e-16 ***
## concentration 96.891 4 < 2.2e-16 ***
## count_time 15.609 1 7.789e-05 ***
## concentration:count_time 32.528 4 1.492e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# show model summary
summary(model4)
## Family: betabinomial ( logit )
## Formula:
## cbind(individuals, total_ants_time - individuals) ~ concentration *
## count_time + (1 | site)
## Data: na.omit(speed_analysis)
##
## AIC BIC logLik deviance df.resid
## 9499.1 9581.2 -4737.6 9475.1 6878
##
## Random effects:
##
## Conditional model:
## Groups Name Variance Std.Dev.
## site (Intercept) 7.902e-10 2.811e-05
## Number of obs: 6890, groups: site, 39
##
## Dispersion parameter for betabinomial family (): 2.67
##
## Conditional model:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.085425 0.135308 -15.412 < 2e-16 ***
## concentration5 0.360732 0.177800 2.029 0.04247 *
## concentration10 0.368354 0.176698 2.085 0.03710 *
## concentration20 1.154534 0.169624 6.806 1.00e-11 ***
## concentration40 1.257059 0.164890 7.624 2.47e-14 ***
## count_time -0.007682 0.001944 -3.951 7.79e-05 ***
## concentration5:count_time 0.006545 0.002447 2.675 0.00747 **
## concentration10:count_time 0.010798 0.002402 4.496 6.93e-06 ***
## concentration20:count_time 0.007082 0.002341 3.025 0.00249 **
## concentration40:count_time 0.011956 0.002297 5.205 1.94e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# report main model findings
report(model4)
## Random effect variances not available. Returned R2 does not account for random effects.
## Can't calculate log-loss.
## Can't calculate proper scoring rules for models without integer response
## values.
## Warning in text == "" | text2 == "": longer object length is not a multiple of
## shorter object length
## Warning in text == "" | text2 == "": longer object length is not a multiple of
## shorter object length
## Random effect variances not available. Returned R2 does not account for random effects.
## Can't calculate log-loss.
## Can't calculate proper scoring rules for models without integer response
## values.
## We fitted a logistic mixed model (estimated using ML and nlminb optimizer) to
## predict cbind(individuals, total_ants_time - individuals) with concentration
## and count_time (formula: cbind(individuals, total_ants_time - individuals) ~
## concentration * count_time). The model included site as random effect (formula:
## ~1 | site). The model's explanatory power related to the fixed effects alone
## (marginal R2) is 0.37. The model's intercept, corresponding to concentration =
## and count_time = 0, is at -2.09 (95% CI [-2.35, -1.82], p < .001). Within this
## model:
##
## - The effect of concentration [10] is statistically significant and negative
## (beta = -7.68e-03, 95% CI [-0.01, -3.87e-03], p < .001; Std. beta = -0.30, 95%
## CI [-0.42, -0.18])
## - The effect of concentration [20] is statistically significant and positive
## (beta = 1.26, 95% CI [0.93, 1.58], p < .001; Std. beta = -0.10, 95% CI [-0.22,
## 0.02])
## - The effect of concentration [40] is statistically significant and positive
## (beta = 0.36, 95% CI [0.01, 0.71], p = 0.042; Std. beta = 0.52, 95% CI [0.40,
## 0.63])
## - The effect of count time is statistically significant and positive (beta =
## 0.37, 95% CI [0.02, 0.71], p = 0.037; Std. beta = -0.02, 95% CI [-0.07, 0.04])
## - The effect of concentration [5] × count time is statistically significant and
## positive (beta = 1.15, 95% CI [0.82, 1.49], p < .001; Std. beta = -0.30, 95% CI
## [-0.43, -0.17])
## - The effect of concentration [10] × count time is statistically significant
## and positive (beta = 7.08e-03, 95% CI [2.49e-03, 0.01], p = 0.002; Std. beta =
## -0.03, 95% CI [-0.14, 0.08])
## - The effect of concentration [20] × count time is statistically significant
## and positive (beta = 0.01, 95% CI [7.45e-03, 0.02], p < .001; Std. beta = 0.15,
## 95% CI [0.04, 0.25])
## - The effect of concentration [40] × count time is statistically significant
## and positive (beta = 6.54e-03, 95% CI [1.75e-03, 0.01], p = 0.007; Std. beta =
## -8.02e-03, 95% CI [-0.11, 0.09])
## - The intercept is statistically significant and positive (beta = 0.01, 95% CI
## [6.09e-03, 0.02], p < .001; Std. beta = -1.48, 95% CI [-1.54, -1.41])
##
## Standardized parameters were obtained by fitting the model on a standardized
## version of the dataset. 95% Confidence Intervals (CIs) and p-values were
## computed using a Wald z-distribution approximation. and We fitted a logistic
## mixed model (estimated using ML and nlminb optimizer) to predict
## cbind(individuals, total_ants_time - individuals) with concentration and
## count_time (formula: cbind(individuals, total_ants_time - individuals) ~
## concentration * count_time). The model included site as random effect (formula:
## ~1 | site). The model's explanatory power related to the fixed effects alone
## (marginal R2) is 0.37. The model's intercept, corresponding to concentration =
## and count_time = 0, is at 2.67 (95% CI [2.43, 2.93]). Within this model:
##
## - The effect of concentration [10] is statistically significant and negative
## (beta = -7.68e-03, 95% CI [-0.01, -3.87e-03], p < .001; Std. beta = -0.30, 95%
## CI [-0.42, -0.18])
## - The effect of concentration [20] is statistically significant and positive
## (beta = 1.26, 95% CI [0.93, 1.58], p < .001; Std. beta = -0.10, 95% CI [-0.22,
## 0.02])
## - The effect of concentration [40] is statistically significant and positive
## (beta = 0.36, 95% CI [0.01, 0.71], p = 0.042; Std. beta = 0.52, 95% CI [0.40,
## 0.63])
## - The effect of count time is statistically significant and positive (beta =
## 0.37, 95% CI [0.02, 0.71], p = 0.037; Std. beta = -0.02, 95% CI [-0.07, 0.04])
## - The effect of concentration [5] × count time is statistically significant and
## positive (beta = 1.15, 95% CI [0.82, 1.49], p < .001; Std. beta = -0.30, 95% CI
## [-0.43, -0.17])
## - The effect of concentration [10] × count time is statistically significant
## and positive (beta = 7.08e-03, 95% CI [2.49e-03, 0.01], p = 0.002; Std. beta =
## -0.03, 95% CI [-0.14, 0.08])
## - The effect of concentration [20] × count time is statistically significant
## and positive (beta = 0.01, 95% CI [7.45e-03, 0.02], p < .001; Std. beta = 0.15,
## 95% CI [0.04, 0.25])
## - The effect of concentration [40] × count time is statistically significant
## and positive (beta = 6.54e-03, 95% CI [1.75e-03, 0.01], p = 0.007; Std. beta =
## -8.02e-03, 95% CI [-0.11, 0.09])
## - The intercept is statistically significant and positive (beta = 0.01, 95% CI
## [6.09e-03, 0.02], p < .001; Std. beta = -1.48, 95% CI [-1.54, -1.41])
##
## Standardized parameters were obtained by fitting the model on a standardized
## version of the dataset. 95% Confidence Intervals (CIs) and p-values were
## computed using a Wald z-distribution approximation.
I fitted a generalized linear mixed model (GLMM) with a beta-binomial distribution (estimated using ML and nlminb optimizer) to analyze the effect of sugar concentration, time since experiment start and the interaction of time and sugar concentration on the abundance of foraging ants on baits (modeled as the proportion of ants on each sugar concentration bait at each counting time) (formula: cbind(individuals, total_ants_time - individuals) ~ concentration * count_time). The model included a random intercept for site (formula: ~ 1|site). The model’s explanatory power related to the fixed effects alone (marginal R2) is 0.37. The very small variance of the random effect (variance = 7.902e-10) indicates that there is little site-level variation in ant abundance once sugar concentration and counting time is accounted for.
The model fit yielded an AIC of 9499.1 and a BIC of 9581.2, with a log-likelihood of -4737.6 and a deviance of 9475.1. The dispersion parameter for the beta-binomial family was estimated to be 2.67, which is appropriate for this family of distributions. The model included 6890 observations, with random effects for 39 sites.
The type III Wald chi-square tests from the analysis of deviance table indicate a highly significant effect of sugar concentration (Chisq(4) = 96.891, p < 2.2e-16), time since experiment start (Chisq(1) = 15.609, p = 7.789e-05), and of the interaction of sugar concentration and time since experiment time (Chisq(4) = 32.528, p = 1.492e-06) on ant foraging, suggesting that the proportion of ants on each sugar concentration was not uniform and that it changed over the course of the experiment.
I can now run a post-hoc analysis to look at the specific slope of the proportion of ants on each sugar concentration bait across experiment time, as well as to test whether there are significant differences in the slopes between concentrations.
I also run a post-hoc analysis to compare the predicted proportion of ants on each sugar concentrations at the first count time since the of the experiment (count_time = 5), i.e. to test whether there are immediate preferences for specific concentrations.
# Get estimated slopes (trends) of count_time for each concentration
tmp_slope_comparisons <- emtrends(model4, pairwise ~ concentration, var = "count_time", adjust = "tukey")
# Display results
summary(tmp_slope_comparisons, test = T)
## $emtrends
## concentration count_time.trend SE df asymp.LCL asymp.UCL
## 0 -0.00768 0.00194 Inf -0.011493 -0.00387
## 5 -0.00114 0.00149 Inf -0.004054 0.00178
## 10 0.00312 0.00141 Inf 0.000349 0.00588
## 20 -0.00060 0.00131 Inf -0.003164 0.00196
## 40 0.00427 0.00122 Inf 0.001881 0.00667
##
## Confidence level used: 0.95
##
## $contrasts
## contrast estimate SE df z.ratio p.value
## concentration0 - concentration5 -0.006545 0.00245 Inf -2.675 0.0577
## concentration0 - concentration10 -0.010798 0.00240 Inf -4.496 0.0001
## concentration0 - concentration20 -0.007082 0.00234 Inf -3.025 0.0210
## concentration0 - concentration40 -0.011956 0.00230 Inf -5.205 <.0001
## concentration5 - concentration10 -0.004253 0.00205 Inf -2.075 0.2311
## concentration5 - concentration20 -0.000537 0.00198 Inf -0.271 0.9988
## concentration5 - concentration40 -0.005411 0.00193 Inf -2.810 0.0398
## concentration10 - concentration20 0.003716 0.00192 Inf 1.933 0.2999
## concentration10 - concentration40 -0.001158 0.00187 Inf -0.620 0.9719
## concentration20 - concentration40 -0.004875 0.00179 Inf -2.722 0.0508
##
## P value adjustment: tukey method for comparing a family of 5 estimates
# Use emmeans to get marginal means for concentrations at count_time = 5
tmp_emmeans_results <- emmeans(model4, ~ concentration | count_time, at = list(count_time = 5))
# Perform pairwise contrasts between concentrations at count_time = 5
tmp_contrast_results <- contrast(tmp_emmeans_results, method = "pairwise")
# Display the results
summary(tmp_contrast_results)
## count_time = 5:
## contrast estimate SE df z.ratio p.value
## concentration0 - concentration5 -0.3935 0.168 Inf -2.341 0.1321
## concentration0 - concentration10 -0.4223 0.167 Inf -2.527 0.0847
## concentration0 - concentration20 -1.1899 0.160 Inf -7.419 <.0001
## concentration0 - concentration40 -1.3168 0.156 Inf -8.450 <.0001
## concentration5 - concentration10 -0.0289 0.155 Inf -0.187 0.9997
## concentration5 - concentration20 -0.7965 0.147 Inf -5.414 <.0001
## concentration5 - concentration40 -0.9234 0.142 Inf -6.486 <.0001
## concentration10 - concentration20 -0.7676 0.146 Inf -5.268 <.0001
## concentration10 - concentration40 -0.8945 0.141 Inf -6.340 <.0001
## concentration20 - concentration40 -0.1269 0.132 Inf -0.958 0.8738
##
## Results are given on the log odds ratio (not the response) scale.
## P value adjustment: tukey method for comparing a family of 5 estimates
To compare the effect of time since experiment start on ant foraging activity across the different sugar concentrations, I conducted a post-hoc analysis using Tukey-adjusted Wald z-tests for pairwise comparisons of the estimated slopes (trends) of the relationship between time since experiment time and ant foraging activity for each sugar concentration on the logit scale.
The estimated slopes of the relationship between time and ant foraging activity for each sugar concentration level are shown below. These results are given on the logit scale.
Concentration Slope (ant activity vs time) SE 95% CI Lower Bound 95% CI Upper Bound 0% -0.00768 0.00194 -0.011493 -0.00387 5% -0.00114 0.00149 -0.004054 0.00178 10% 0.00312 0.00141 0.000349 0.00588 20% -0.00060 0.00131 -0.003164 0.00196 40% 0.00427 0.00122 0.001881 0.00667
The table shows that on baits with 0% sugar, the proportion of ants on the bait decreases across the time of the experiment (slope = -0.00768, 95% CI = [-0.011, -0.004]). On baits with 5% sugar (slope = -0.001, 95% CI = [-0.004, 0.002]) and 20% sugar (slope = -0.0006, 95% CI = [-0.003, 0.002]), the proportion of ants on the baits does not significantly change over the time of the experiment, as shown by slopes very close to 0 and 95% confidence intervals overlapping with 0. Baits with 10% sugar (slope = 0.003, 95% CI = [0.0003, 0.006]) and 40% sugar (slope = 0.004, 95% CI = [0.002, 0.007]) show an increase in the proportion of ants present on the baits over the time of the experiment.
The pairwise contrasts for sugar concentration levels (with Tukey adjustment for multiple comparisons) are summarized below.
Contrast Estimate SE z-ratio p-value concentration 0% - 5% -0.006545 0.00245 -2.675 0.0577 . concentration 0% - 10% -0.010798 0.00240 -4.496 0.0001 concentration 0% - 20% -0.007082 0.00234 -3.025 0.0210 concentration 0% - 40% -0.011956 0.00230 -5.205 <.0001 concentration 5% - 10% -0.004253 0.00205 -2.075 0.2311 concentration 5% - 20% -0.000537 0.00198 -0.271 0.9988 concentration 5% - 40% -0.005411 0.00193 -2.810 0.0398 concentration 10% - 20% 0.003716 0.00192 1.933 0.2999 concentration 10% - 40% -0.001158 0.00187 -0.620 0.9719 concentration 20% - 40% -0.004875 0.00179 -2.722 0.0508 .
The relationship between time since experiment start and ant activity is only marginally different on 5% baits form 0% baits (see table, p = 0.0577). The slope for 10%, 20%, and 40% baits is significantly different from the slope of 0% baits (see table, p < 0.05). This suggests that ant activity changed at different rates over the course of the experiment on low concentration baits (0%, 5%) than on higher concentration baits (10%, 20%, 40%). There is a significant difference in the slope on 5% baits and 40% baits (see table, p = 0.0398), as well as a marginally significant difference between the slopes on 20% baits and 40% baits (see table, p = 0.0508). However, the differences between 5%, 10%, and 20% are not significant (see table, p > 0.05), suggesting that the trends in the proportion of ants on these baits is similar over the time of the experiment.
To test whether there is a significant difference in the predicted
proportion of ants at the start of the experiment (0 minutes since
experiment start), I conducted a post-hoc analysis using Tukey-adjusted
Wald z-tests for pairwise comparisons of the estimated marginal means
(EMMs) for each sugar concentration while holding
count_time constant at 5.
The estimated marginal means for each sugar concentration level at 5 minutes since experiment start are shown below. These results are given on the logit scale.
Contrast Estimate SE z-ratio p-value concentration 0% - 5% -0.36073 0.178 -2.029 0.2521 concentration 0% - 10% -0.36835 0.177 -2.085 0.2266 concentration 0% - 20% -1.15453 0.170 -6.806 <.0001 concentration 0% - 40% -1.25706 0.165 -7.624 <.0001 concentration 5% - 10% -0.00762 0.163 -0.047 1.0000 concentration 5% - 20% -0.79380 0.155 -5.114 <.0001 concentration 5% - 40% -0.89633 0.150 -5.966 <.0001 concentration 10% - 20% -0.78618 0.154 -5.117 <.0001 concentration 10% - 40% -0.88871 0.149 -5.973 <.0001 concentration 20% - 40% -0.10253 0.140 -0.733 0.9487
As can be seen on the table, there is no significant difference in the predicted proportion of ants at the start of the experiment between the baits with 0%, 5%, and 10% sugar (see table, p > 0.05). Similarly, there is no significant difference in the predicted proportion of ants on the 20% and 40% baits at the start of the experiment (see table, p = 0.95). However, there is a highly significant difference in the predicted proportion of ants at the start of the experiment between these two groups of baits (0%, 5%, 10% vs. 20%, 40%) (see table, p < 0.0001). This shows that according to the model’s predictions, from the start of the experiment there is a higher proportion of ants on the baits with the highest sugar concentrations (20% sugar and 40% sugar) than on the baits with lower sugar concentrations (0% sugar, 5% sugar, and 10% sugar).
I can plot the predicted probabilities from the model:
# define the counting times over which to generate predictions
tmp_count_time_seq <- c(5, 10, 20, 40, 80, 120)
# Use emmeans() to predict over this sequence of count_time values, while keeping concentration as a factor
tmp_emm_time_preds <- emmeans(model4, ~ count_time | concentration, at = list(count_time = tmp_count_time_seq), type = "response")
# Convert the predictions to a dataframe
tmp_emm_time_df <- as.data.frame(tmp_emm_time_preds)
# Plot using ggplot2
(tmp_sugar_vs_time_plot <- ggplot(tmp_emm_time_df, aes(x = count_time, y = prob, color = factor(concentration))) +
geom_line() +
geom_point(size = 2) +
geom_errorbar(aes(ymin = asymp.LCL, ymax = asymp.UCL), width = 0.2) + # 95% CI
labs(title = "",
x = "Time since experiment start (minutes)",
y = "Predicted proportion of individuals",
color = "Sugar concentration (%)") +
scale_color_viridis_d() +
scale_x_continuous(breaks = c(5, 10, 20, 40, 80, 120), limits = c(0, 122))+
theme_bw() +
theme(panel.grid.major = element_blank(), # Remove major grid
panel.grid.minor = element_blank(),
axis.title.y = element_text(size = 10, margin = margin(r = 10), hjust = 0.5), # adjust distance from axis title to axis, center title
axis.title.x = element_text(size = 10, margin = margin(t = 10)),
legend.title = element_text(size = 9, hjust = 0.5),
legend.text = element_text(size = 8))
)
rm(list = ls(pattern = "^tmp_"))